From 04dd1017bbefbbbd7a9e5d5b4d2bc01c5c1c58bb Mon Sep 17 00:00:00 2001 From: stswangzhiping <59632378+stswangzhiping@users.noreply.github.com> Date: Sat, 28 Mar 2026 14:37:56 +0800 Subject: [PATCH] fix(network): wired ping probe, AP/WS and systemd notify hardening - Add hasWiredInternetProbe and export; AP mode uses it with hasInternet - systemd-env: strip NOTIFY_SOCKET from env early; client uses unix_dgram - Strip NOTIFY_SOCKET from frpc/ttyd spawn env in watchdog and frpc - WS: pong miss debounce; AP net monitor consecutive-fail debounce Made-with: Cursor --- bin/clawd.js | 3 ++ lib/client.js | 96 +++++++++++++++++++++++++++++++++++----------- lib/frpc.js | 3 ++ lib/network.js | 15 ++++++++ lib/systemd-env.js | 17 ++++++++ lib/watchdog.js | 6 ++- 6 files changed, 116 insertions(+), 24 deletions(-) create mode 100644 lib/systemd-env.js diff --git a/bin/clawd.js b/bin/clawd.js index 6c06d83..6f58c41 100644 --- a/bin/clawd.js +++ b/bin/clawd.js @@ -1,6 +1,9 @@ #!/usr/bin/env node 'use strict'; +// 先于其它模块:摘掉 NOTIFY_SOCKET,避免任意子进程误发 systemd notify +require('../lib/systemd-env'); + const { ClawClient } = require('../lib/client'); const log = require('../lib/logger'); diff --git a/lib/client.js b/lib/client.js index 339cb3c..8515620 100644 --- a/lib/client.js +++ b/lib/client.js @@ -1,7 +1,9 @@ 'use strict'; +const { getNotifySocket } = require('./systemd-env'); const WebSocket = require('ws'); -const { execFileSync, execSync } = require('child_process'); +const dgram = require('dgram'); +const { execSync } = require('child_process'); const config = require('./config'); const log = require('./logger'); const { getBoxId } = require('./fingerprint'); @@ -9,13 +11,16 @@ const { collect } = require('./metrics'); const { getDashboardInfo, resolveOpenclawConfigFile, startTtyd, FrpcManager } = require('./frpc'); // getDashboardInfo 也用于心跳中定期刷新 const { ProvisionManager } = require('./provisioning'); const { BtMonitor } = require('./bt-monitor'); -const { hasInternet, getLocalIps } = require('./network'); +const { hasInternet, hasWiredInternetProbe, getLocalIps } = require('./network'); const led = require('./led'); const MAX_BACKOFF_MS = 60_000; -const PONG_TIMEOUT_MS = 8_000; -const PING_INTERVAL_MS = 10_000; +/** 连续若干轮 ping 后仍无 pong 才判定死链(单轮易因调度/弱网误判) */ +const PONG_MISS_MAX = 3; +const PING_INTERVAL_MS = 15_000; const NET_MONITOR_MS = 5_000; // AP 模式网络监视间隔 +/** AP 下 nmcli/ping 易抖动:连续多轮无上行再关 WS,避免误杀仍通的长连接 */ +const AP_NET_FAIL_MAX = 3; const HEARTBEAT_INTERVAL_MS = 10_000; // 心跳间隔:10 秒,用于快速感知网络状态 const METRICS_EVERY_N = 3; // 每 N 次心跳采集一次指标(= 30 秒) @@ -43,9 +48,11 @@ class ClawClient { // WS 层活性检测 this._pingTimer = null; this._awaitingPong = false; + this._pongMissCount = 0; - // AP 模式网络监视(WS 连通后每 5s 检查,断网立即 terminate) + // AP 模式网络监视(WS 连通后每 5s 检查;连续多轮无上行才 terminate) this._netMonitorTimer = null; + this._apNetFailStreak = 0; // WS 连续失败计数(open 时清零) this._wsFailCount = 0; @@ -55,8 +62,10 @@ class ClawClient { this._certTimeError = false; - // systemd watchdog - this._sdTimer = null; + // systemd watchdog(主进程 unix_dgram → NOTIFY_SOCKET) + this._sdTimer = null; + this._sdDgram = null; + this._sdNotifyAddr = null; this._setupGlobalHandlers(); } @@ -179,6 +188,11 @@ class ClawClient { if (this._ws) this._ws.terminate(); led.status.off(); // 进程退出,两灯全灭 this._sdNotify('STOPPING=1'); + if (this._sdDgram) { + try { this._sdDgram.close(); } catch (_) {} + this._sdDgram = null; + } + this._sdNotifyAddr = null; log.info('clawd', '已停止'); log.close(); } @@ -188,8 +202,8 @@ class ClawClient { _connect() { if (this._stopped) return; - // AP 模式 + 无网:不建立 WS,5s 后重新检查网络 - if (this._provisionMgr && this._provisionMgr.isApMode() && !hasInternet()) { + // AP 模式 + 无网:不建立 WS,5s 后重新检查(有线经 -I ping 仍通则建立,避免热点误挡 WS) + if (this._provisionMgr && this._provisionMgr.isApMode() && !hasInternet() && !hasWiredInternetProbe()) { led.display.showAP(); log.info('clawd', 'AP 模式无网络,5s 后重新检查...'); this._backoff = 1_000; // 有网时立即快速重连 @@ -226,6 +240,7 @@ class ClawClient { ws.on('pong', () => { this._awaitingPong = false; + this._pongMissCount = 0; }); ws.on('close', (code, reason) => { @@ -237,8 +252,8 @@ class ClawClient { log.warn('clawd', `连接断开 (${code}),失败次数=${this._wsFailCount},${this._backoff / 1000}s 后重连...`); if (this._hasEverConnected && this._wsFailCount >= 3) { const inAp = this._provisionMgr && this._provisionMgr.isApMode(); - if (inAp || !hasInternet()) { - led.display.showAP(); // AP 模式 或 无网 + if (inAp || (!hasInternet() && !hasWiredInternetProbe())) { + led.display.showAP(); // AP 模式 或 无网(有线探测也无则视为无上行) } else { led.display.showErr0(); // STA 模式 + 有网 但 VPS 不可达 } @@ -273,9 +288,13 @@ class ClawClient { if (!this._ws || this._ws.readyState !== WebSocket.OPEN) return; if (this._awaitingPong) { - log.warn('clawd', 'Pong 超时,连接可能已死,主动关闭重连'); - this._ws.terminate(); - return; + this._pongMissCount++; + if (this._pongMissCount >= PONG_MISS_MAX) { + log.warn('clawd', `Pong 连续 ${PONG_MISS_MAX} 次未响应,主动关闭重连`); + this._ws.terminate(); + return; + } + log.warn('clawd', `Pong 超时 (${this._pongMissCount}/${PONG_MISS_MAX}),继续探测...`); } this._awaitingPong = true; @@ -289,18 +308,31 @@ class ClawClient { this._pingTimer = null; } this._awaitingPong = false; + this._pongMissCount = 0; } // ── AP 模式网络监视(拔网线后 ≤5s 感知)──────────────────────────────────── _startNetMonitor() { this._clearNetMonitor(); + this._apNetFailStreak = 0; this._netMonitorTimer = setInterval(() => { - if (!this._provisionMgr || !this._provisionMgr.isApMode()) return; - if (hasInternet()) return; - // AP 模式 + 无网,但 WS 还"活着" → 立即终止,触发 close → _connect() 进入 5s 轮询 - log.warn('clawd', 'AP 模式检测到网络断开,主动关闭 WS'); + if (!this._provisionMgr || !this._provisionMgr.isApMode()) { + this._apNetFailStreak = 0; + return; + } + if (hasInternet() || hasWiredInternetProbe()) { + this._apNetFailStreak = 0; + return; + } + this._apNetFailStreak++; + if (this._apNetFailStreak < AP_NET_FAIL_MAX) { + log.info('clawd', `AP 网监:无上行 (${this._apNetFailStreak}/${AP_NET_FAIL_MAX}),累计后再判定`); + return; + } + log.warn('clawd', 'AP 模式检测到网络断开(已连续多次无上行),主动关闭 WS'); led.display.showAP(); + this._apNetFailStreak = 0; if (this._ws) this._ws.terminate(); }, NET_MONITOR_MS); } @@ -310,6 +342,7 @@ class ClawClient { clearInterval(this._netMonitorTimer); this._netMonitorTimer = null; } + this._apNetFailStreak = 0; } // ── 发送 connect ───────────────────────────────────────────────────────────── @@ -538,18 +571,35 @@ class ClawClient { _startSdNotify() { if (!SD_NOTIFY_INTERVAL) return; + const raw = getNotifySocket(); + if (!raw) return; + + // 抽象套接字:NOTIFY_SOCKET 以 @ 开头,内核地址首字节为 \0(与 sd_notify 一致) + this._sdNotifyAddr = raw.startsWith('@') ? `\0${raw.slice(1)}` : raw; + + try { + this._sdDgram = dgram.createSocket('unix_dgram'); + this._sdDgram.on('error', () => { /* 忽略,避免未处理 error 崩溃 */ }); + } catch (_) { + this._sdNotifyAddr = null; + return; + } + log.debug('clawd', `systemd watchdog 启用,通知间隔 ${SD_NOTIFY_INTERVAL}ms`); this._sdNotify('READY=1'); this._sdTimer = setInterval(() => this._sdNotify('WATCHDOG=1'), SD_NOTIFY_INTERVAL); } + /** + * 必须由本进程(主 PID)发往 NOTIFY_SOCKET;exec systemd-notify 会换 PID, + * systemd 在 NotifyAccess=main 下会拒绝并刷屏 journal。 + */ _sdNotify(msg) { - if (!process.env.NOTIFY_SOCKET) return; + if (!this._sdDgram || !this._sdNotifyAddr) return; + const payload = Buffer.from(msg.endsWith('\n') ? msg : `${msg}\n`); try { - execFileSync('systemd-notify', ['--pid=' + process.pid, msg], { timeout: 2000 }); - } catch (_) { - // systemd-notify 不可用时静默忽略 - } + this._sdDgram.send(payload, 0, payload.length, this._sdNotifyAddr, () => {}); + } catch (_) { /* ignore */ } } } diff --git a/lib/frpc.js b/lib/frpc.js index 0c83594..b2d98ca 100644 --- a/lib/frpc.js +++ b/lib/frpc.js @@ -124,9 +124,12 @@ async function startTtyd() { const shell = fs.existsSync('/bin/bash') ? '/bin/bash' : '/bin/sh'; // 以普通用户身份启动 shell(与 SSH 登录一致) const ttydUser = process.env.CLAWD_TTY_USER || 'sts'; + const ttyEnv = { ...process.env }; + delete ttyEnv.NOTIFY_SOCKET; const proc = spawn(TTYD_BIN, ['-p', String(TTYD_PORT), '-i', '127.0.0.1', '-W', '-t', 'cursorBlink=true', '/bin/su', '-', ttydUser], { stdio: 'ignore', detached: true, + env: ttyEnv, }); proc.unref(); log.info('ttyd', `已启动,端口 ${TTYD_PORT},用户=${ttydUser}`); diff --git a/lib/network.js b/lib/network.js index 7ae0577..a93e441 100644 --- a/lib/network.js +++ b/lib/network.js @@ -79,6 +79,20 @@ function _tryPingInternet() { return false; } +/** + * 仅经有线口 ping 公网(不依赖默认路由)。 + * AP 开启时 hasInternet() 易误判;维持 WS / 网络监视时用此兜底。 + */ +function hasWiredInternetProbe() { + const wired = getWiredIfaceWithCarrier(); + if (!wired) return false; + try { + run(`ping -c 1 -W 3 -I ${wired} 8.8.8.8`); + return true; + } catch (_) {} + return false; +} + /** * 检测是否有互联网连接(nmcli 连通性 + ping 兜底) */ @@ -298,6 +312,7 @@ function getLocalIps() { module.exports = { hasInternet, hasWiredCarrier, + hasWiredInternetProbe, getWiredIfaceWithCarrier, hasSavedWifiConnection, isWifiStaConnected, diff --git a/lib/systemd-env.js b/lib/systemd-env.js new file mode 100644 index 0000000..50f9672 --- /dev/null +++ b/lib/systemd-env.js @@ -0,0 +1,17 @@ +'use strict'; + +/** + * 在任意子进程(nmcli、pkill、frpc、依赖库)启动前,从 process.env 摘掉 NOTIFY_SOCKET。 + * 否则子进程继承后可能向 systemd 发 sd_notify,触发「仅主 PID 可收」的 journal 刷屏。 + * 主进程通过 getNotifySocket() 取回路径,自行 unix_dgram 发送。 + */ +const _notifySocket = process.env.NOTIFY_SOCKET; +if (_notifySocket) { + delete process.env.NOTIFY_SOCKET; +} + +function getNotifySocket() { + return _notifySocket; +} + +module.exports = { getNotifySocket }; diff --git a/lib/watchdog.js b/lib/watchdog.js index dc9c2bd..c589c97 100644 --- a/lib/watchdog.js +++ b/lib/watchdog.js @@ -67,9 +67,13 @@ class Watchdog { if (this._stopped) return; log.info(this._name, '启动进程...'); + const { env: optsEnv, ...restSpawn } = this._spawnOpts; + const env = { ...process.env, ...optsEnv }; + delete env.NOTIFY_SOCKET; // 避免 frpc 等子进程向 systemd 发 notify,触发非主 PID 拒收 const proc = spawn(this._bin, this._args, { stdio: ['ignore', 'pipe', 'pipe'], - ...this._spawnOpts, + ...restSpawn, + env, }); this._proc = proc;