fix(systemd): pet watchdog when WATCHDOG_USEC missing; StartLimitInterval for old systemd
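On some embedded systemd builds the unit sets WatchdogSec but WATCHDOG_USEC is not injected into the service environment. SD_NOTIFY_INTERVAL was therefore 0, so no WATCHDOG=1 was ever sent and systemd killed the main process after ~60s. When WATCHDOG_USEC is missing, the client now assumes a 60s watchdog period (matching WatchdogSec=60 in the unit) and keeps sending WATCHDOG=1 at half that period. Also replace StartLimitIntervalSec= with StartLimitInterval= so that older systemd versions accept the setting (the former produced a journal warning). Made-with: Cursor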
This commit is contained in:
@@ -195,7 +195,8 @@ WorkingDirectory=$INSTALL_DIR
|
|||||||
# 重启策略
|
# 重启策略
|
||||||
Restart=always
|
Restart=always
|
||||||
RestartSec=5
|
RestartSec=5
|
||||||
StartLimitIntervalSec=300
|
# 旧版 systemd 不认 StartLimitIntervalSec,用 StartLimitInterval=(秒)
|
||||||
|
StartLimitInterval=300
|
||||||
StartLimitBurst=10
|
StartLimitBurst=10
|
||||||
|
|
||||||
# 优雅停止(10s 内 SIGTERM,超时 SIGKILL)
|
# 优雅停止(10s 内 SIGTERM,超时 SIGKILL)
|
||||||
|
|||||||
@@ -24,12 +24,6 @@ const AP_NET_FAIL_MAX = 3;
|
|||||||
const HEARTBEAT_INTERVAL_MS = 10_000; // 心跳间隔:10 秒,用于快速感知网络状态
|
const HEARTBEAT_INTERVAL_MS = 10_000; // 心跳间隔:10 秒,用于快速感知网络状态
|
||||||
const METRICS_EVERY_N = 3; // 每 N 次心跳采集一次指标(= 30 秒)
|
const METRICS_EVERY_N = 3; // 每 N 次心跳采集一次指标(= 30 秒)
|
||||||
|
|
||||||
// systemd watchdog: 如果 WatchdogSec 存在,定期发 WATCHDOG=1
|
|
||||||
const SD_WATCHDOG_USEC = parseInt(process.env.WATCHDOG_USEC || '0', 10);
|
|
||||||
const SD_NOTIFY_INTERVAL = SD_WATCHDOG_USEC > 0
|
|
||||||
? Math.floor(SD_WATCHDOG_USEC / 2 / 1000) // 半周期通知(μs → ms)
|
|
||||||
: 0;
|
|
||||||
|
|
||||||
class ClawClient {
|
class ClawClient {
|
||||||
constructor() {
|
constructor() {
|
||||||
this._cfg = config.load();
|
this._cfg = config.load();
|
||||||
@@ -569,25 +563,34 @@ class ClawClient {
|
|||||||
// ── systemd Watchdog ────────────────────────────────────────────────────────
|
// ── systemd Watchdog ────────────────────────────────────────────────────────
|
||||||
|
|
||||||
_startSdNotify() {
|
_startSdNotify() {
|
||||||
if (!SD_NOTIFY_INTERVAL) return;
|
|
||||||
|
|
||||||
const raw = getNotifySocket();
|
const raw = getNotifySocket();
|
||||||
if (!raw) return;
|
if (!raw) return;
|
||||||
|
|
||||||
|
// 部分嵌入式 systemd 有 WatchdogSec 但未注入 WATCHDOG_USEC;若此时不喂狗,约 1min 会 SIGABRT
|
||||||
|
let usec = parseInt(process.env.WATCHDOG_USEC || '0', 10);
|
||||||
|
if (usec <= 0) {
|
||||||
|
usec = 60_000_000;
|
||||||
|
log.info('clawd', 'WATCHDOG_USEC 未设置,按 60s 周期向 systemd 发送 WATCHDOG=1(与 unit WatchdogSec=60 一致)');
|
||||||
|
}
|
||||||
|
const intervalMs = Math.max(1000, Math.floor(usec / 2 / 1000));
|
||||||
|
|
||||||
// 抽象套接字:NOTIFY_SOCKET 以 @ 开头,内核地址首字节为 \0(与 sd_notify 一致)
|
// 抽象套接字:NOTIFY_SOCKET 以 @ 开头,内核地址首字节为 \0(与 sd_notify 一致)
|
||||||
this._sdNotifyAddr = raw.startsWith('@') ? `\0${raw.slice(1)}` : raw;
|
this._sdNotifyAddr = raw.startsWith('@') ? `\0${raw.slice(1)}` : raw;
|
||||||
|
|
||||||
try {
|
try {
|
||||||
this._sdDgram = dgram.createSocket('unix_dgram');
|
this._sdDgram = dgram.createSocket('unix_dgram');
|
||||||
this._sdDgram.on('error', () => { /* 忽略,避免未处理 error 崩溃 */ });
|
this._sdDgram.on('error', (err) => {
|
||||||
} catch (_) {
|
log.warn('clawd', 'systemd notify socket:', err.message);
|
||||||
|
});
|
||||||
|
} catch (err) {
|
||||||
this._sdNotifyAddr = null;
|
this._sdNotifyAddr = null;
|
||||||
|
log.warn('clawd', 'systemd notify dgram 创建失败:', err.message);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
log.debug('clawd', `systemd watchdog 启用,通知间隔 ${SD_NOTIFY_INTERVAL}ms`);
|
log.debug('clawd', `systemd watchdog 启用,通知间隔 ${intervalMs}ms`);
|
||||||
this._sdNotify('READY=1');
|
this._sdNotify('READY=1');
|
||||||
this._sdTimer = setInterval(() => this._sdNotify('WATCHDOG=1'), SD_NOTIFY_INTERVAL);
|
this._sdTimer = setInterval(() => this._sdNotify('WATCHDOG=1'), intervalMs);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|||||||
Reference in New Issue
Block a user