fix(network): wired ping probe, AP/WS and systemd notify hardening
- Add hasWiredInternetProbe and export it; AP mode uses it together with hasInternet
- systemd-env: strip NOTIFY_SOCKET from env early; client uses unix_dgram
- Strip NOTIFY_SOCKET from frpc/ttyd spawn env in watchdog and frpc
- WS: pong miss debounce; AP net monitor consecutive-fail debounce

Made-with: Cursor
This commit is contained in:
@@ -1,7 +1,9 @@
|
||||
'use strict';
|
||||
|
||||
const { getNotifySocket } = require('./systemd-env');
|
||||
const WebSocket = require('ws');
|
||||
const { execFileSync, execSync } = require('child_process');
|
||||
const dgram = require('dgram');
|
||||
const { execSync } = require('child_process');
|
||||
const config = require('./config');
|
||||
const log = require('./logger');
|
||||
const { getBoxId } = require('./fingerprint');
|
||||
@@ -9,13 +11,16 @@ const { collect } = require('./metrics');
|
||||
const { getDashboardInfo, resolveOpenclawConfigFile, startTtyd, FrpcManager } = require('./frpc'); // getDashboardInfo 也用于心跳中定期刷新
|
||||
const { ProvisionManager } = require('./provisioning');
|
||||
const { BtMonitor } = require('./bt-monitor');
|
||||
const { hasInternet, getLocalIps } = require('./network');
|
||||
const { hasInternet, hasWiredInternetProbe, getLocalIps } = require('./network');
|
||||
const led = require('./led');
|
||||
|
||||
const MAX_BACKOFF_MS = 60_000;
// NOTE(review): no code visible in this view reads PONG_TIMEOUT_MS; kept in case other code still does.
const PONG_TIMEOUT_MS = 8_000;
/**
 * A link is declared dead only after this many consecutive ping rounds without a pong
 * (a single round is easily misjudged because of scheduling delays or a weak network).
 */
const PONG_MISS_MAX = 3;
// Declared exactly once: a second `const PING_INTERVAL_MS` in the same scope is a SyntaxError.
const PING_INTERVAL_MS = 15_000;
const NET_MONITOR_MS = 5_000; // network-monitor interval in AP mode
/**
 * In AP mode nmcli/ping results tend to flap: the WS is closed only after several
 * consecutive rounds with no uplink, so a still-working long-lived connection is not killed by mistake.
 */
const AP_NET_FAIL_MAX = 3;
const HEARTBEAT_INTERVAL_MS = 10_000; // heartbeat interval: 10 s, for quickly noticing network state
const METRICS_EVERY_N = 3; // collect metrics once every N heartbeats (= 30 s)
|
||||
|
||||
@@ -43,9 +48,11 @@ class ClawClient {
|
||||
// WS 层活性检测
|
||||
this._pingTimer = null;
|
||||
this._awaitingPong = false;
|
||||
this._pongMissCount = 0;
|
||||
|
||||
// AP 模式网络监视(WS 连通后每 5s 检查,断网立即 terminate)
|
||||
// AP 模式网络监视(WS 连通后每 5s 检查;连续多轮无上行才 terminate)
|
||||
this._netMonitorTimer = null;
|
||||
this._apNetFailStreak = 0;
|
||||
|
||||
// WS 连续失败计数(open 时清零)
|
||||
this._wsFailCount = 0;
|
||||
@@ -55,8 +62,10 @@ class ClawClient {
|
||||
this._certTimeError = false;
|
||||
|
||||
|
||||
// systemd watchdog
|
||||
this._sdTimer = null;
|
||||
// systemd watchdog(主进程 unix_dgram → NOTIFY_SOCKET)
|
||||
this._sdTimer = null;
|
||||
this._sdDgram = null;
|
||||
this._sdNotifyAddr = null;
|
||||
|
||||
this._setupGlobalHandlers();
|
||||
}
|
||||
@@ -179,6 +188,11 @@ class ClawClient {
|
||||
if (this._ws) this._ws.terminate();
|
||||
led.status.off(); // 进程退出,两灯全灭
|
||||
this._sdNotify('STOPPING=1');
|
||||
if (this._sdDgram) {
|
||||
try { this._sdDgram.close(); } catch (_) {}
|
||||
this._sdDgram = null;
|
||||
}
|
||||
this._sdNotifyAddr = null;
|
||||
log.info('clawd', '已停止');
|
||||
log.close();
|
||||
}
|
||||
@@ -188,8 +202,8 @@ class ClawClient {
|
||||
_connect() {
|
||||
if (this._stopped) return;
|
||||
|
||||
// AP 模式 + 无网:不建立 WS,5s 后重新检查网络
|
||||
if (this._provisionMgr && this._provisionMgr.isApMode() && !hasInternet()) {
|
||||
// AP 模式 + 无网:不建立 WS,5s 后重新检查(有线经 -I ping 仍通则建立,避免热点误挡 WS)
|
||||
if (this._provisionMgr && this._provisionMgr.isApMode() && !hasInternet() && !hasWiredInternetProbe()) {
|
||||
led.display.showAP();
|
||||
log.info('clawd', 'AP 模式无网络,5s 后重新检查...');
|
||||
this._backoff = 1_000; // 有网时立即快速重连
|
||||
@@ -226,6 +240,7 @@ class ClawClient {
|
||||
|
||||
ws.on('pong', () => {
|
||||
this._awaitingPong = false;
|
||||
this._pongMissCount = 0;
|
||||
});
|
||||
|
||||
ws.on('close', (code, reason) => {
|
||||
@@ -237,8 +252,8 @@ class ClawClient {
|
||||
log.warn('clawd', `连接断开 (${code}),失败次数=${this._wsFailCount},${this._backoff / 1000}s 后重连...`);
|
||||
if (this._hasEverConnected && this._wsFailCount >= 3) {
|
||||
const inAp = this._provisionMgr && this._provisionMgr.isApMode();
|
||||
if (inAp || !hasInternet()) {
|
||||
led.display.showAP(); // AP 模式 或 无网
|
||||
if (inAp || (!hasInternet() && !hasWiredInternetProbe())) {
|
||||
led.display.showAP(); // AP 模式 或 无网(有线探测也无则视为无上行)
|
||||
} else {
|
||||
led.display.showErr0(); // STA 模式 + 有网 但 VPS 不可达
|
||||
}
|
||||
@@ -273,9 +288,13 @@ class ClawClient {
|
||||
if (!this._ws || this._ws.readyState !== WebSocket.OPEN) return;
|
||||
|
||||
if (this._awaitingPong) {
|
||||
log.warn('clawd', 'Pong 超时,连接可能已死,主动关闭重连');
|
||||
this._ws.terminate();
|
||||
return;
|
||||
this._pongMissCount++;
|
||||
if (this._pongMissCount >= PONG_MISS_MAX) {
|
||||
log.warn('clawd', `Pong 连续 ${PONG_MISS_MAX} 次未响应,主动关闭重连`);
|
||||
this._ws.terminate();
|
||||
return;
|
||||
}
|
||||
log.warn('clawd', `Pong 超时 (${this._pongMissCount}/${PONG_MISS_MAX}),继续探测...`);
|
||||
}
|
||||
|
||||
this._awaitingPong = true;
|
||||
@@ -289,18 +308,31 @@ class ClawClient {
|
||||
this._pingTimer = null;
|
||||
}
|
||||
this._awaitingPong = false;
|
||||
this._pongMissCount = 0;
|
||||
}
|
||||
|
||||
// ── AP 模式网络监视(拔网线后 ≤5s 感知)────────────────────────────────────
|
||||
|
||||
_startNetMonitor() {
|
||||
this._clearNetMonitor();
|
||||
this._apNetFailStreak = 0;
|
||||
this._netMonitorTimer = setInterval(() => {
|
||||
if (!this._provisionMgr || !this._provisionMgr.isApMode()) return;
|
||||
if (hasInternet()) return;
|
||||
// AP 模式 + 无网,但 WS 还"活着" → 立即终止,触发 close → _connect() 进入 5s 轮询
|
||||
log.warn('clawd', 'AP 模式检测到网络断开,主动关闭 WS');
|
||||
if (!this._provisionMgr || !this._provisionMgr.isApMode()) {
|
||||
this._apNetFailStreak = 0;
|
||||
return;
|
||||
}
|
||||
if (hasInternet() || hasWiredInternetProbe()) {
|
||||
this._apNetFailStreak = 0;
|
||||
return;
|
||||
}
|
||||
this._apNetFailStreak++;
|
||||
if (this._apNetFailStreak < AP_NET_FAIL_MAX) {
|
||||
log.info('clawd', `AP 网监:无上行 (${this._apNetFailStreak}/${AP_NET_FAIL_MAX}),累计后再判定`);
|
||||
return;
|
||||
}
|
||||
log.warn('clawd', 'AP 模式检测到网络断开(已连续多次无上行),主动关闭 WS');
|
||||
led.display.showAP();
|
||||
this._apNetFailStreak = 0;
|
||||
if (this._ws) this._ws.terminate();
|
||||
}, NET_MONITOR_MS);
|
||||
}
|
||||
@@ -310,6 +342,7 @@ class ClawClient {
|
||||
clearInterval(this._netMonitorTimer);
|
||||
this._netMonitorTimer = null;
|
||||
}
|
||||
this._apNetFailStreak = 0;
|
||||
}
|
||||
|
||||
// ── 发送 connect ─────────────────────────────────────────────────────────────
|
||||
@@ -538,18 +571,35 @@ class ClawClient {
|
||||
_startSdNotify() {
|
||||
if (!SD_NOTIFY_INTERVAL) return;
|
||||
|
||||
const raw = getNotifySocket();
|
||||
if (!raw) return;
|
||||
|
||||
// 抽象套接字:NOTIFY_SOCKET 以 @ 开头,内核地址首字节为 \0(与 sd_notify 一致)
|
||||
this._sdNotifyAddr = raw.startsWith('@') ? `\0${raw.slice(1)}` : raw;
|
||||
|
||||
try {
|
||||
this._sdDgram = dgram.createSocket('unix_dgram');
|
||||
this._sdDgram.on('error', () => { /* 忽略,避免未处理 error 崩溃 */ });
|
||||
} catch (_) {
|
||||
this._sdNotifyAddr = null;
|
||||
return;
|
||||
}
|
||||
|
||||
log.debug('clawd', `systemd watchdog 启用,通知间隔 ${SD_NOTIFY_INTERVAL}ms`);
|
||||
this._sdNotify('READY=1');
|
||||
this._sdTimer = setInterval(() => this._sdNotify('WATCHDOG=1'), SD_NOTIFY_INTERVAL);
|
||||
}
|
||||
|
||||
/**
|
||||
* 必须由本进程(主 PID)发往 NOTIFY_SOCKET;exec systemd-notify 会换 PID,
|
||||
* systemd 在 NotifyAccess=main 下会拒绝并刷屏 journal。
|
||||
*/
|
||||
_sdNotify(msg) {
|
||||
if (!process.env.NOTIFY_SOCKET) return;
|
||||
if (!this._sdDgram || !this._sdNotifyAddr) return;
|
||||
const payload = Buffer.from(msg.endsWith('\n') ? msg : `${msg}\n`);
|
||||
try {
|
||||
execFileSync('systemd-notify', ['--pid=' + process.pid, msg], { timeout: 2000 });
|
||||
} catch (_) {
|
||||
// systemd-notify 不可用时静默忽略
|
||||
}
|
||||
this._sdDgram.send(payload, 0, payload.length, this._sdNotifyAddr, () => {});
|
||||
} catch (_) { /* ignore */ }
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -124,9 +124,12 @@ async function startTtyd() {
|
||||
const shell = fs.existsSync('/bin/bash') ? '/bin/bash' : '/bin/sh';
|
||||
// 以普通用户身份启动 shell(与 SSH 登录一致)
|
||||
const ttydUser = process.env.CLAWD_TTY_USER || 'sts';
|
||||
const ttyEnv = { ...process.env };
|
||||
delete ttyEnv.NOTIFY_SOCKET;
|
||||
const proc = spawn(TTYD_BIN, ['-p', String(TTYD_PORT), '-i', '127.0.0.1', '-W', '-t', 'cursorBlink=true', '/bin/su', '-', ttydUser], {
|
||||
stdio: 'ignore',
|
||||
detached: true,
|
||||
env: ttyEnv,
|
||||
});
|
||||
proc.unref();
|
||||
log.info('ttyd', `已启动,端口 ${TTYD_PORT},用户=${ttydUser}`);
|
||||
|
||||
@@ -79,6 +79,20 @@ function _tryPingInternet() {
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
 * Ping a public address through the wired interface only (independent of the
 * default route).
 *
 * While the AP is up, hasInternet() is prone to misjudging; this probe is the
 * fallback used when keeping the WS alive and in the network monitor.
 *
 * @returns {boolean} true when a wired interface with carrier exists and the
 *   ping command completed without run() throwing; false otherwise.
 */
function hasWiredInternetProbe() {
  const iface = getWiredIfaceWithCarrier();
  let reachable = false;
  if (iface) {
    try {
      // Relies on run() throwing when the command fails.
      run(`ping -c 1 -W 3 -I ${iface} 8.8.8.8`);
      reachable = true;
    } catch (_) {
      // A failed probe simply means "no wired uplink".
    }
  }
  return reachable;
}
|
||||
|
||||
/**
|
||||
* 检测是否有互联网连接(nmcli 连通性 + ping 兜底)
|
||||
*/
|
||||
@@ -298,6 +312,7 @@ function getLocalIps() {
|
||||
module.exports = {
|
||||
hasInternet,
|
||||
hasWiredCarrier,
|
||||
hasWiredInternetProbe,
|
||||
getWiredIfaceWithCarrier,
|
||||
hasSavedWifiConnection,
|
||||
isWifiStaConnected,
|
||||
|
||||
17
lib/systemd-env.js
Normal file
17
lib/systemd-env.js
Normal file
@@ -0,0 +1,17 @@
|
||||
'use strict';
|
||||
|
||||
/**
 * Remove NOTIFY_SOCKET from process.env before any child process (nmcli,
 * pkill, frpc, dependency libraries) is started.
 *
 * Otherwise children inherit it and may send sd_notify messages to systemd,
 * which floods the journal with "only the main PID is accepted" rejections.
 * The main process retrieves the path through getNotifySocket() and sends its
 * own unix_dgram notifications.
 */
const { NOTIFY_SOCKET: capturedNotifySocket } = process.env;

// Only a non-empty value is removed; the captured copy stays available below.
if (capturedNotifySocket) {
  delete process.env.NOTIFY_SOCKET;
}

/**
 * @returns {string|undefined} the NOTIFY_SOCKET value seen when this module
 *   was loaded, or undefined when the variable was not set.
 */
function getNotifySocket() {
  return capturedNotifySocket;
}
|
||||
|
||||
module.exports = { getNotifySocket };
|
||||
@@ -67,9 +67,13 @@ class Watchdog {
|
||||
if (this._stopped) return;
|
||||
|
||||
log.info(this._name, '启动进程...');
|
||||
const { env: optsEnv, ...restSpawn } = this._spawnOpts;
|
||||
const env = { ...process.env, ...optsEnv };
|
||||
delete env.NOTIFY_SOCKET; // 避免 frpc 等子进程向 systemd 发 notify,触发非主 PID 拒收
|
||||
const proc = spawn(this._bin, this._args, {
|
||||
stdio: ['ignore', 'pipe', 'pipe'],
|
||||
...this._spawnOpts,
|
||||
...restSpawn,
|
||||
env,
|
||||
});
|
||||
this._proc = proc;
|
||||
|
||||
|
||||
Reference in New Issue
Block a user