Fix write-check by disabling verification by default, since it isn't working yet. Also add tests
This commit is contained in:
parent
dc5023d272
commit
ef7463a29a
@ -9,6 +9,8 @@ export LOG_LEVEL=info
|
||||
|
||||
# Optional write confirmation probe settings:
|
||||
export WRITE_CHECK_ENABLED=true
|
||||
# false = consider publish OK sufficient; true = require read-after-write confirmation
|
||||
export WRITE_CHECK_VERIFY_READ=false
|
||||
export WRITE_CHECK_KIND=30078
|
||||
# WRITE_CHECK_PRIVKEY accepts either:
|
||||
# - nsec1... string
|
||||
|
||||
30
README.md
30
README.md
@ -7,6 +7,7 @@ Production-oriented Prometheus exporter for monitoring a fixed set of Nostr rela
|
||||
- Probes each configured relay either on an interval, or on-demand when `/metrics` is scraped.
|
||||
- Uses `@nostrwatch/nocap` for `open` + `read` checks.
|
||||
- Optionally performs a low-noise write confirmation check (kind `30078` by default) using deterministic `d` tags.
|
||||
- By default, treats relay `OK` for publish as success; optional read-after-write verification can be enabled.
|
||||
- Exposes:
|
||||
- `/metrics` for Prometheus scraping
|
||||
- `/healthz` for process and probe-loop health
|
||||
@ -48,6 +49,7 @@ Environment variables:
|
||||
- `PORT` (default: `9464`)
|
||||
- `LOG_LEVEL` (default: `info`; one of `debug|info|warn|error`)
|
||||
- `WRITE_CHECK_ENABLED` (default: `true`)
|
||||
- `WRITE_CHECK_VERIFY_READ` (default: `false`; when `true`, require event read-back after publish)
|
||||
- `WRITE_CHECK_KIND` (default: `30078`)
|
||||
- `WRITE_CHECK_PRIVKEY` (optional; supports `nsec1...` or 64-char hex)
|
||||
|
||||
@ -79,6 +81,34 @@ Run tests:
|
||||
pnpm test
|
||||
```
|
||||
|
||||
Run the live write-confirm diagnostic for `offchain.pub` (opt-in):
|
||||
|
||||
```bash
|
||||
LIVE_RELAY_TEST_OFFCHAIN=1 \
|
||||
LIVE_RELAY_TEST_RELAYS="wss://offchain.pub" \
|
||||
pnpm test test/live.offchain.test.ts
|
||||
```
|
||||
|
||||
Full example with write verification enabled:
|
||||
|
||||
```bash
|
||||
LIVE_RELAY_TEST_OFFCHAIN=1 \
|
||||
LIVE_RELAY_TEST_RELAYS="wss://offchain.pub" \
|
||||
LIVE_RELAY_TEST_WRITE_VERIFY_READ=1 \
|
||||
LIVE_RELAY_TEST_TIMEOUT_SECONDS=8 \
|
||||
LIVE_RELAY_TEST_EXPECT_NO_FAILURES=1 \
|
||||
pnpm test test/live.offchain.test.ts
|
||||
```
|
||||
|
||||
Optional knobs:
|
||||
|
||||
- `LIVE_RELAY_TEST_SAMPLES` (default `12`)
|
||||
- `LIVE_RELAY_TEST_SCRAPE_EVERY_MS` (default `1500`)
|
||||
- `LIVE_RELAY_TEST_TIMEOUT_SECONDS` (default `8`)
|
||||
- `LIVE_RELAY_TEST_RELAYS` (default `"wss://offchain.pub"`; comma-separated relay list)
|
||||
- `LIVE_RELAY_TEST_WRITE_VERIFY_READ=1` to enable read-after-write verification (disabled by default)
|
||||
- `LIVE_RELAY_TEST_EXPECT_NO_FAILURES=1` to make the test fail on any write-confirm/probe failures
|
||||
|
||||
## Exposed metrics
|
||||
|
||||
Relay-level labels use `{relay}` unless stated:
|
||||
|
||||
@ -8,7 +8,7 @@
|
||||
"dev": "tsx watch src/index.ts",
|
||||
"build": "tsc -p tsconfig.json",
|
||||
"start": "node dist/index.js",
|
||||
"test": "tsx --test test/config.test.ts test/smoke.test.ts"
|
||||
"test": "tsx --test test/config.test.ts test/smoke.test.ts test/live.offchain.test.ts"
|
||||
},
|
||||
"keywords": [
|
||||
"nostr",
|
||||
|
||||
@ -10,6 +10,7 @@ export interface RelayTarget {
|
||||
|
||||
export interface WriteCheckConfig {
|
||||
enabled: boolean;
|
||||
verifyRead: boolean;
|
||||
kind: number;
|
||||
privkey?: string;
|
||||
}
|
||||
@ -34,6 +35,7 @@ const DEFAULT_LISTEN_ADDR = "0.0.0.0";
|
||||
const DEFAULT_PORT = 9464;
|
||||
const DEFAULT_LOG_LEVEL: LogLevel = "info";
|
||||
const DEFAULT_WRITE_CHECK_ENABLED = true;
|
||||
const DEFAULT_WRITE_CHECK_VERIFY_READ = false;
|
||||
const DEFAULT_WRITE_CHECK_KIND = 30078;
|
||||
|
||||
const MIN_TIMEOUT_SECONDS = 1;
|
||||
@ -170,6 +172,10 @@ export function loadConfig(env: NodeJS.ProcessEnv = process.env): AppConfig {
|
||||
}
|
||||
|
||||
const writeEnabledFlag = parseBoolean(env.WRITE_CHECK_ENABLED, DEFAULT_WRITE_CHECK_ENABLED);
|
||||
const writeVerifyReadFlag = parseBoolean(
|
||||
env.WRITE_CHECK_VERIFY_READ,
|
||||
DEFAULT_WRITE_CHECK_VERIFY_READ,
|
||||
);
|
||||
const writeCheckKind = parseInteger(
|
||||
env.WRITE_CHECK_KIND,
|
||||
DEFAULT_WRITE_CHECK_KIND,
|
||||
@ -200,6 +206,7 @@ export function loadConfig(env: NodeJS.ProcessEnv = process.env): AppConfig {
|
||||
const staleAfterMs = probeIntervalMs + probeTimeoutMs + 5_000;
|
||||
const writeCheck: WriteCheckConfig = {
|
||||
enabled: writeEnabledFlag,
|
||||
verifyRead: writeVerifyReadFlag,
|
||||
kind: writeCheckKind,
|
||||
...(resolvedWritePrivkey ? { privkey: resolvedWritePrivkey } : {}),
|
||||
};
|
||||
|
||||
@ -23,6 +23,7 @@ interface RelayCheckNode {
|
||||
interface NocapResult {
|
||||
open?: RelayCheckNode;
|
||||
read?: RelayCheckNode;
|
||||
write?: RelayCheckNode;
|
||||
}
|
||||
|
||||
interface RelayProbeState {
|
||||
@ -192,8 +193,13 @@ export class RelayProber {
|
||||
const adapters = Object.values(NocapEveryAdapterDefault);
|
||||
await nocap.useAdapters(adapters);
|
||||
|
||||
const checks: string[] =
|
||||
this.config.writeCheck.enabled && !this.config.writeCheck.verifyRead
|
||||
? ["open", "read", "write"]
|
||||
: ["open", "read"];
|
||||
|
||||
const result = (await withTimeout(
|
||||
nocap.check(["open", "read"], true) as Promise<NocapResult>,
|
||||
nocap.check(checks, true) as Promise<NocapResult>,
|
||||
this.config.probeTimeoutMs * 2,
|
||||
"nocap open/read",
|
||||
)) as NocapResult;
|
||||
@ -211,27 +217,49 @@ export class RelayProber {
|
||||
}
|
||||
|
||||
if (this.config.writeCheck.enabled) {
|
||||
const privkey = this.config.writeCheck.privkey;
|
||||
if (!privkey) {
|
||||
writeConfirmOk = false;
|
||||
this.metrics.incProbeError(relayLabel, "write_confirm");
|
||||
} else {
|
||||
const writeResult = await runWriteConfirm({
|
||||
relay: relay.relay,
|
||||
host: relay.host,
|
||||
kind: this.config.writeCheck.kind,
|
||||
privkey,
|
||||
timeoutMs: this.config.probeTimeoutMs,
|
||||
});
|
||||
writeConfirmOk = writeResult.ok;
|
||||
writeDurationMs = durationOrDefault(writeResult.durationMs);
|
||||
if (!writeResult.ok) {
|
||||
if (!this.config.writeCheck.verifyRead) {
|
||||
// Match nocap behavior: use websocket write check in the same nocap session.
|
||||
const writeNode = result.write;
|
||||
writeConfirmOk =
|
||||
writeNode !== undefined &&
|
||||
writeNode.status !== "error" &&
|
||||
writeNode.data !== false;
|
||||
writeDurationMs = durationOrDefault(result.write?.duration);
|
||||
if (!writeConfirmOk) {
|
||||
this.metrics.incProbeError(relayLabel, "write_confirm");
|
||||
this.logger.warn(
|
||||
{ relay: relayLabel, check: "write_confirm", reason: writeResult.reason },
|
||||
{
|
||||
relay: relayLabel,
|
||||
check: "write_confirm",
|
||||
reason: result.write?.message ?? "nocap write check failed",
|
||||
},
|
||||
"write confirmation failed",
|
||||
);
|
||||
}
|
||||
} else {
|
||||
const privkey = this.config.writeCheck.privkey;
|
||||
if (!privkey) {
|
||||
writeConfirmOk = false;
|
||||
this.metrics.incProbeError(relayLabel, "write_confirm");
|
||||
} else {
|
||||
const writeResult = await runWriteConfirm({
|
||||
relay: relay.relay,
|
||||
host: relay.host,
|
||||
kind: this.config.writeCheck.kind,
|
||||
privkey,
|
||||
verifyRead: this.config.writeCheck.verifyRead,
|
||||
timeoutMs: this.config.probeTimeoutMs,
|
||||
});
|
||||
writeConfirmOk = writeResult.ok;
|
||||
writeDurationMs = durationOrDefault(writeResult.durationMs);
|
||||
if (!writeResult.ok) {
|
||||
this.metrics.incProbeError(relayLabel, "write_confirm");
|
||||
this.logger.warn(
|
||||
{ relay: relayLabel, check: "write_confirm", reason: writeResult.reason },
|
||||
"write confirmation failed",
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -6,6 +6,7 @@ interface WriteConfirmInput {
|
||||
host: string;
|
||||
kind: number;
|
||||
privkey: string;
|
||||
verifyRead: boolean;
|
||||
timeoutMs: number;
|
||||
}
|
||||
|
||||
@ -108,6 +109,13 @@ export async function runWriteConfirm(input: WriteConfirmInput): Promise<WriteCo
|
||||
|
||||
await withTimeout(relay.publish(event), input.timeoutMs, "relay publish");
|
||||
|
||||
if (!input.verifyRead) {
|
||||
return {
|
||||
ok: true,
|
||||
durationMs: Math.round(performance.now() - startedAt),
|
||||
};
|
||||
}
|
||||
|
||||
const elapsedMs = performance.now() - startedAt;
|
||||
const remainingMs = Math.max(1, input.timeoutMs - Math.floor(elapsedMs));
|
||||
|
||||
|
||||
@ -3,15 +3,14 @@ import assert from "node:assert/strict";
|
||||
|
||||
import { loadConfig } from "../src/config.js";
|
||||
|
||||
test("loadConfig parses valid relays and ignores invalid entries", () => {
|
||||
test("loadConfig parses relay input and ignores invalid entries", () => {
|
||||
const config = loadConfig({
|
||||
RELAYS: "wss://relay.damus.io,not-a-url,wss://nos.lol",
|
||||
RELAYS: "wss://offchain.pub,not-a-url,wss://offchain.pub",
|
||||
WRITE_CHECK_ENABLED: "false",
|
||||
});
|
||||
|
||||
assert.equal(config.relays.length, 2);
|
||||
assert.equal(config.relays[0]?.relay, "wss://relay.damus.io/");
|
||||
assert.equal(config.relays[1]?.relay, "wss://nos.lol/");
|
||||
assert.equal(config.relays.length, 1);
|
||||
assert.equal(config.relays[0]?.relay, "wss://offchain.pub/");
|
||||
assert.equal(config.writeCheck.enabled, false);
|
||||
assert.equal(config.probeIntervalMs, 0);
|
||||
assert.ok(config.warnings.some((w) => w.includes("Ignoring invalid relay URL")));
|
||||
@ -19,7 +18,7 @@ test("loadConfig parses valid relays and ignores invalid entries", () => {
|
||||
|
||||
test("loadConfig generates write-check private key when missing", () => {
|
||||
const config = loadConfig({
|
||||
RELAYS: "wss://relay.damus.io",
|
||||
RELAYS: "wss://offchain.pub",
|
||||
WRITE_CHECK_ENABLED: "true",
|
||||
});
|
||||
|
||||
@ -33,7 +32,7 @@ test("loadConfig generates write-check private key when missing", () => {
|
||||
|
||||
test("loadConfig generates write-check private key when configured key is invalid", () => {
|
||||
const config = loadConfig({
|
||||
RELAYS: "wss://relay.damus.io",
|
||||
RELAYS: "wss://offchain.pub",
|
||||
WRITE_CHECK_ENABLED: "true",
|
||||
WRITE_CHECK_PRIVKEY: "not-a-valid-key",
|
||||
});
|
||||
@ -46,3 +45,16 @@ test("loadConfig generates write-check private key when configured key is invali
|
||||
"expected warning about invalid configured key",
|
||||
);
|
||||
});
|
||||
|
||||
test("loadConfig sets write-check read verification flag", () => {
|
||||
const defaultConfig = loadConfig({
|
||||
RELAYS: "wss://offchain.pub",
|
||||
});
|
||||
assert.equal(defaultConfig.writeCheck.verifyRead, false);
|
||||
|
||||
const verifyConfig = loadConfig({
|
||||
RELAYS: "wss://offchain.pub",
|
||||
WRITE_CHECK_VERIFY_READ: "true",
|
||||
});
|
||||
assert.equal(verifyConfig.writeCheck.verifyRead, true);
|
||||
});
|
||||
|
||||
180
test/live.offchain.test.ts
Normal file
180
test/live.offchain.test.ts
Normal file
@ -0,0 +1,180 @@
|
||||
import test from "node:test";
|
||||
import assert from "node:assert/strict";
|
||||
import { spawn, type ChildProcessByStdio } from "node:child_process";
|
||||
import http from "node:http";
|
||||
import type { Readable } from "node:stream";
|
||||
import { setTimeout as sleep } from "node:timers/promises";
|
||||
|
||||
const ENABLED = process.env.LIVE_RELAY_TEST_OFFCHAIN === "1";
|
||||
const SCRAPE_SAMPLES = Number.parseInt(process.env.LIVE_RELAY_TEST_SAMPLES ?? "12", 10);
|
||||
const SCRAPE_EVERY_MS = Number.parseInt(process.env.LIVE_RELAY_TEST_SCRAPE_EVERY_MS ?? "1500", 10);
|
||||
const PROBE_TIMEOUT_SECONDS = Number.parseInt(process.env.LIVE_RELAY_TEST_TIMEOUT_SECONDS ?? "8", 10);
|
||||
const WRITE_VERIFY_READ = process.env.LIVE_RELAY_TEST_WRITE_VERIFY_READ === "1";
|
||||
const ENFORCE_NO_WRITE_CONFIRM_FAILURES = process.env.LIVE_RELAY_TEST_EXPECT_NO_FAILURES === "1";
|
||||
|
||||
function randomPort(): number {
|
||||
return 20_000 + Math.floor(Math.random() * 10_000);
|
||||
}
|
||||
|
||||
function httpGet(
|
||||
port: number,
|
||||
path: string,
|
||||
): Promise<{ statusCode: number; body: string; headers: http.IncomingHttpHeaders }> {
|
||||
return new Promise((resolve, reject) => {
|
||||
const req = http.request(
|
||||
{
|
||||
hostname: "127.0.0.1",
|
||||
port,
|
||||
path,
|
||||
method: "GET",
|
||||
timeout: 5_000,
|
||||
},
|
||||
(res) => {
|
||||
const chunks: Buffer[] = [];
|
||||
res.on("data", (chunk) => {
|
||||
chunks.push(Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk));
|
||||
});
|
||||
res.on("end", () => {
|
||||
resolve({
|
||||
statusCode: res.statusCode ?? 0,
|
||||
body: Buffer.concat(chunks).toString("utf8"),
|
||||
headers: res.headers,
|
||||
});
|
||||
});
|
||||
},
|
||||
);
|
||||
req.on("timeout", () => {
|
||||
req.destroy(new Error(`timeout GET ${path}`));
|
||||
});
|
||||
req.on("error", reject);
|
||||
req.end();
|
||||
});
|
||||
}
|
||||
|
||||
type ExporterChild = ChildProcessByStdio<null, Readable, Readable>;
|
||||
|
||||
async function waitForStartup(child: ExporterChild, port: number, timeoutMs: number): Promise<void> {
|
||||
const started = Date.now();
|
||||
while (Date.now() - started < timeoutMs) {
|
||||
if (child.exitCode !== null) {
|
||||
throw new Error(`exporter exited early with code ${child.exitCode}`);
|
||||
}
|
||||
try {
|
||||
const res = await httpGet(port, "/healthz");
|
||||
if ([200, 503].includes(res.statusCode)) return;
|
||||
} catch {
|
||||
// server not up yet
|
||||
}
|
||||
await sleep(150);
|
||||
}
|
||||
throw new Error("timeout waiting for exporter startup");
|
||||
}
|
||||
|
||||
function metricValue(metrics: string, name: string, labels: Record<string, string>): number | null {
|
||||
const entries = Object.entries(labels);
|
||||
const labelsPattern = entries
|
||||
.map(([k, v]) => `${k}="${v.replace(/[.*+?^${}()|[\]\\]/g, "\\$&")}"`)
|
||||
.join(",");
|
||||
const linePattern = new RegExp(`^${name}\\{${labelsPattern}\\} (-?\\d+(?:\\.\\d+)?)$`, "m");
|
||||
const match = metrics.match(linePattern);
|
||||
if (!match?.[1]) return null;
|
||||
const parsed = Number.parseFloat(match[1]);
|
||||
return Number.isFinite(parsed) ? parsed : null;
|
||||
}
|
||||
|
||||
test(
|
||||
"live exporter probe diagnostic against offchain.pub",
|
||||
{ skip: !ENABLED },
|
||||
async (t) => {
|
||||
const relayInput = process.env.LIVE_RELAY_TEST_RELAYS ?? "wss://offchain.pub";
|
||||
const relays = relayInput
|
||||
.split(",")
|
||||
.map((v) => v.trim())
|
||||
.filter((v) => v.length > 0);
|
||||
assert.ok(relays.length > 0, "LIVE_RELAY_TEST_RELAYS must include at least one relay");
|
||||
|
||||
let relayLabel: string;
|
||||
try {
|
||||
relayLabel = new URL(relays[0]).toString();
|
||||
} catch {
|
||||
assert.fail(`invalid first relay URL in LIVE_RELAY_TEST_RELAYS: ${relays[0]}`);
|
||||
}
|
||||
|
||||
const port = randomPort();
|
||||
const child = spawn("pnpm", ["exec", "tsx", "src/index.ts"], {
|
||||
cwd: process.cwd(),
|
||||
env: {
|
||||
...process.env,
|
||||
RELAYS: relays.join(","),
|
||||
PROBE_INTERVAL_SECONDS: "1",
|
||||
PROBE_TIMEOUT_SECONDS: String(PROBE_TIMEOUT_SECONDS),
|
||||
WRITE_CHECK_ENABLED: "true",
|
||||
WRITE_CHECK_VERIFY_READ: WRITE_VERIFY_READ ? "true" : "false",
|
||||
PORT: String(port),
|
||||
LOG_LEVEL: "error",
|
||||
},
|
||||
stdio: ["ignore", "pipe", "pipe"],
|
||||
});
|
||||
|
||||
let stderr = "";
|
||||
child.stderr.on("data", (d: Buffer) => {
|
||||
stderr += d.toString("utf8");
|
||||
});
|
||||
|
||||
t.after(() => {
|
||||
if (child.exitCode === null) {
|
||||
child.kill("SIGTERM");
|
||||
}
|
||||
});
|
||||
|
||||
await waitForStartup(child, port, 10_000);
|
||||
|
||||
let lastMetrics = "";
|
||||
for (let i = 0; i < SCRAPE_SAMPLES; i += 1) {
|
||||
const metrics = await httpGet(port, "/metrics");
|
||||
assert.equal(metrics.statusCode, 200);
|
||||
lastMetrics = metrics.body;
|
||||
await sleep(SCRAPE_EVERY_MS);
|
||||
}
|
||||
|
||||
const writeConfirmOk = metricValue(lastMetrics, "nostr_relay_write_confirm_ok", { relay: relayLabel });
|
||||
const writeConfirmErrors = metricValue(lastMetrics, "nostr_relay_probe_errors_total", {
|
||||
relay: relayLabel,
|
||||
check: "write_confirm",
|
||||
});
|
||||
const successRuns = metricValue(lastMetrics, "nostr_relay_probe_runs_total", {
|
||||
relay: relayLabel,
|
||||
result: "success",
|
||||
});
|
||||
const failureRuns = metricValue(lastMetrics, "nostr_relay_probe_runs_total", {
|
||||
relay: relayLabel,
|
||||
result: "failure",
|
||||
});
|
||||
|
||||
const diagnostic = {
|
||||
relay: relayLabel,
|
||||
relays,
|
||||
scrapeSamples: SCRAPE_SAMPLES,
|
||||
scrapeEveryMs: SCRAPE_EVERY_MS,
|
||||
writeVerifyRead: WRITE_VERIFY_READ,
|
||||
writeConfirmOk,
|
||||
writeConfirmErrors,
|
||||
successRuns,
|
||||
failureRuns,
|
||||
crashed: child.exitCode !== null,
|
||||
};
|
||||
console.log("[live-offchain-exporter]", JSON.stringify(diagnostic));
|
||||
|
||||
assert.equal(child.exitCode, null, `exporter crashed unexpectedly: ${stderr}`);
|
||||
assert.notEqual(writeConfirmOk, null, "missing nostr_relay_write_confirm_ok metric");
|
||||
assert.ok(
|
||||
successRuns !== null || failureRuns !== null,
|
||||
"missing nostr_relay_probe_runs_total metrics",
|
||||
);
|
||||
|
||||
if (ENFORCE_NO_WRITE_CONFIRM_FAILURES) {
|
||||
assert.equal(writeConfirmErrors ?? 0, 0, `write_confirm failures seen: ${JSON.stringify(diagnostic)}`);
|
||||
assert.equal(failureRuns ?? 0, 0, `probe failures seen: ${JSON.stringify(diagnostic)}`);
|
||||
}
|
||||
},
|
||||
);
|
||||
Loading…
x
Reference in New Issue
Block a user