diff --git a/.env.example b/.env.example index 1ff6263..e55a496 100644 --- a/.env.example +++ b/.env.example @@ -9,6 +9,8 @@ export LOG_LEVEL=info # Optional write confirmation probe settings: export WRITE_CHECK_ENABLED=true +# false = consider publish OK sufficient; true = require read-after-write confirmation +export WRITE_CHECK_VERIFY_READ=false export WRITE_CHECK_KIND=30078 # WRITE_CHECK_PRIVKEY accepts either: # - nsec1... string diff --git a/README.md b/README.md index 87b4878..4ac5f4a 100644 --- a/README.md +++ b/README.md @@ -7,6 +7,7 @@ Production-oriented Prometheus exporter for monitoring a fixed set of Nostr rela - Probes each configured relay either on an interval, or on-demand when `/metrics` is scraped. - Uses `@nostrwatch/nocap` for `open` + `read` checks. - Optionally performs a low-noise write confirmation check (kind `30078` by default) using deterministic `d` tags. +- By default, treats relay `OK` for publish as success; optional read-after-write verification can be enabled. - Exposes: - `/metrics` for Prometheus scraping - `/healthz` for process and probe-loop health @@ -48,6 +49,7 @@ Environment variables: - `PORT` (default: `9464`) - `LOG_LEVEL` (default: `info`; one of `debug|info|warn|error`) - `WRITE_CHECK_ENABLED` (default: `true`) +- `WRITE_CHECK_VERIFY_READ` (default: `false`; when `true`, require event read-back after publish) - `WRITE_CHECK_KIND` (default: `30078`) - `WRITE_CHECK_PRIVKEY` (optional; supports `nsec1...` or 64-char hex) @@ -79,6 +81,34 @@ Run tests: pnpm test ``` +Run the live write-confirm diagnostic for `offchain.pub` (opt-in): + +```bash +LIVE_RELAY_TEST_OFFCHAIN=1 \ +LIVE_RELAY_TEST_RELAYS="wss://offchain.pub" \ +pnpm test test/live.offchain.test.ts +``` + +Full example with write verification enabled: + +```bash +LIVE_RELAY_TEST_OFFCHAIN=1 \ +LIVE_RELAY_TEST_RELAYS="wss://offchain.pub" \ +LIVE_RELAY_TEST_WRITE_VERIFY_READ=1 \ +LIVE_RELAY_TEST_TIMEOUT_SECONDS=8 \ +LIVE_RELAY_TEST_EXPECT_NO_FAILURES=1 \ +pnpm test test/live.offchain.test.ts +``` + +Optional knobs: + +- `LIVE_RELAY_TEST_SAMPLES` (default `12`) +- `LIVE_RELAY_TEST_SCRAPE_EVERY_MS` (default `1500`) +- `LIVE_RELAY_TEST_TIMEOUT_SECONDS` (default `8`) +- `LIVE_RELAY_TEST_RELAYS` (default `"wss://offchain.pub"`; comma-separated relay list) +- `LIVE_RELAY_TEST_WRITE_VERIFY_READ=1` to enable read-after-write verification (disabled by default) +- `LIVE_RELAY_TEST_EXPECT_NO_FAILURES=1` to make the test fail on any write-confirm/probe failures + ## Exposed metrics Relay-level labels use `{relay}` unless stated: diff --git a/package.json b/package.json index beef9cc..357637e 100644 --- a/package.json +++ b/package.json @@ -8,7 +8,7 @@ "dev": "tsx watch src/index.ts", "build": "tsc -p tsconfig.json", "start": "node dist/index.js", - "test": "tsx --test test/config.test.ts test/smoke.test.ts" + "test": "tsx --test test/config.test.ts test/smoke.test.ts test/live.offchain.test.ts" }, "keywords": [ "nostr", diff --git a/src/config.ts b/src/config.ts index 6c57c7f..5beec2f 100644 --- a/src/config.ts +++ b/src/config.ts @@ -10,6 +10,7 @@ export interface RelayTarget { export interface WriteCheckConfig { enabled: boolean; + verifyRead: boolean; kind: number; privkey?: string; } @@ -34,6 +35,7 @@ const DEFAULT_LISTEN_ADDR = "0.0.0.0"; const DEFAULT_PORT = 9464; const DEFAULT_LOG_LEVEL: LogLevel = "info"; const DEFAULT_WRITE_CHECK_ENABLED = true; +const DEFAULT_WRITE_CHECK_VERIFY_READ = false; const DEFAULT_WRITE_CHECK_KIND = 30078; const MIN_TIMEOUT_SECONDS = 1; @@ -170,6 +172,10 @@ export function loadConfig(env: NodeJS.ProcessEnv = process.env): AppConfig { } const writeEnabledFlag = parseBoolean(env.WRITE_CHECK_ENABLED, DEFAULT_WRITE_CHECK_ENABLED); + const writeVerifyReadFlag = parseBoolean( + env.WRITE_CHECK_VERIFY_READ, + DEFAULT_WRITE_CHECK_VERIFY_READ, + ); const writeCheckKind = parseInteger( env.WRITE_CHECK_KIND, DEFAULT_WRITE_CHECK_KIND, @@ -200,6 +206,7 @@ export function loadConfig(env: NodeJS.ProcessEnv = process.env): AppConfig { const staleAfterMs = probeIntervalMs + probeTimeoutMs + 5_000; const writeCheck: WriteCheckConfig = { enabled: writeEnabledFlag, + verifyRead: writeVerifyReadFlag, kind: writeCheckKind, ...(resolvedWritePrivkey ? { privkey: resolvedWritePrivkey } : {}), }; diff --git a/src/prober.ts b/src/prober.ts index a05c8c8..f851d67 100644 --- a/src/prober.ts +++ b/src/prober.ts @@ -23,6 +23,7 @@ interface RelayCheckNode { interface NocapResult { open?: RelayCheckNode; read?: RelayCheckNode; + write?: RelayCheckNode; } interface RelayProbeState { @@ -192,8 +193,13 @@ export class RelayProber { const adapters = Object.values(NocapEveryAdapterDefault); await nocap.useAdapters(adapters); + const checks: string[] = + this.config.writeCheck.enabled && !this.config.writeCheck.verifyRead + ? ["open", "read", "write"] + : ["open", "read"]; + const result = (await withTimeout( - nocap.check(["open", "read"], true) as Promise, + nocap.check(checks, true) as Promise, this.config.probeTimeoutMs * 2, "nocap open/read", )) as NocapResult; @@ -211,27 +217,49 @@ export class RelayProber { } if (this.config.writeCheck.enabled) { - const privkey = this.config.writeCheck.privkey; - if (!privkey) { - writeConfirmOk = false; - this.metrics.incProbeError(relayLabel, "write_confirm"); - } else { - const writeResult = await runWriteConfirm({ - relay: relay.relay, - host: relay.host, - kind: this.config.writeCheck.kind, - privkey, - timeoutMs: this.config.probeTimeoutMs, - }); - writeConfirmOk = writeResult.ok; - writeDurationMs = durationOrDefault(writeResult.durationMs); - if (!writeResult.ok) { + if (!this.config.writeCheck.verifyRead) { + // Match nocap behavior: use websocket write check in the same nocap session. + const writeNode = result.write; + writeConfirmOk = + writeNode !== undefined && + writeNode.status !== "error" && + writeNode.data !== false; + writeDurationMs = durationOrDefault(result.write?.duration); + if (!writeConfirmOk) { this.metrics.incProbeError(relayLabel, "write_confirm"); this.logger.warn( - { relay: relayLabel, check: "write_confirm", reason: writeResult.reason }, + { + relay: relayLabel, + check: "write_confirm", + reason: result.write?.message ?? "nocap write check failed", + }, "write confirmation failed", ); } + } else { + const privkey = this.config.writeCheck.privkey; + if (!privkey) { + writeConfirmOk = false; + this.metrics.incProbeError(relayLabel, "write_confirm"); + } else { + const writeResult = await runWriteConfirm({ + relay: relay.relay, + host: relay.host, + kind: this.config.writeCheck.kind, + privkey, + verifyRead: this.config.writeCheck.verifyRead, + timeoutMs: this.config.probeTimeoutMs, + }); + writeConfirmOk = writeResult.ok; + writeDurationMs = durationOrDefault(writeResult.durationMs); + if (!writeResult.ok) { + this.metrics.incProbeError(relayLabel, "write_confirm"); + this.logger.warn( + { relay: relayLabel, check: "write_confirm", reason: writeResult.reason }, + "write confirmation failed", + ); + } + } } } diff --git a/src/writeConfirm.ts b/src/writeConfirm.ts index 01fe8a0..79c1afc 100644 --- a/src/writeConfirm.ts +++ b/src/writeConfirm.ts @@ -6,6 +6,7 @@ interface WriteConfirmInput { host: string; kind: number; privkey: string; + verifyRead: boolean; timeoutMs: number; } @@ -108,6 +109,13 @@ export async function runWriteConfirm(input: WriteConfirmInput): Promise { +test("loadConfig parses relay input and ignores invalid entries", () => { const config = loadConfig({ - RELAYS: "wss://relay.damus.io,not-a-url,wss://nos.lol", + RELAYS: "wss://offchain.pub,not-a-url,wss://offchain.pub", WRITE_CHECK_ENABLED: "false", }); - assert.equal(config.relays.length, 2); - assert.equal(config.relays[0]?.relay, "wss://relay.damus.io/"); - assert.equal(config.relays[1]?.relay, "wss://nos.lol/"); + assert.equal(config.relays.length, 1); + assert.equal(config.relays[0]?.relay, "wss://offchain.pub/"); assert.equal(config.writeCheck.enabled, false); assert.equal(config.probeIntervalMs, 0); assert.ok(config.warnings.some((w) => w.includes("Ignoring invalid relay URL"))); @@ -19,7 +18,7 @@ test("loadConfig parses valid relays and ignores invalid entries", () => { test("loadConfig generates write-check private key when missing", () => { const config = loadConfig({ - RELAYS: "wss://relay.damus.io", + RELAYS: "wss://offchain.pub", WRITE_CHECK_ENABLED: "true", }); @@ -33,7 +32,7 @@ test("loadConfig generates write-check private key when missing", () => { test("loadConfig generates write-check private key when configured key is invalid", () => { const config = loadConfig({ - RELAYS: "wss://relay.damus.io", + RELAYS: "wss://offchain.pub", WRITE_CHECK_ENABLED: "true", WRITE_CHECK_PRIVKEY: "not-a-valid-key", }); @@ -46,3 +45,16 @@ test("loadConfig generates write-check private key when configured key is invali "expected warning about invalid configured key", ); }); + +test("loadConfig sets write-check read verification flag", () => { + const defaultConfig = loadConfig({ + RELAYS: "wss://offchain.pub", + }); + assert.equal(defaultConfig.writeCheck.verifyRead, false); + + const verifyConfig = loadConfig({ + RELAYS: "wss://offchain.pub", + WRITE_CHECK_VERIFY_READ: "true", + }); + assert.equal(verifyConfig.writeCheck.verifyRead, true); +}); diff --git a/test/live.offchain.test.ts b/test/live.offchain.test.ts new file mode 100644 index 0000000..65aef01 --- /dev/null +++ b/test/live.offchain.test.ts @@ -0,0 +1,180 @@ +import test from "node:test"; +import assert from "node:assert/strict"; +import { spawn, type ChildProcessByStdio } from "node:child_process"; +import http from "node:http"; +import type { Readable } from "node:stream"; +import { setTimeout as sleep } from "node:timers/promises"; + +const ENABLED = process.env.LIVE_RELAY_TEST_OFFCHAIN === "1"; +const SCRAPE_SAMPLES = Number.parseInt(process.env.LIVE_RELAY_TEST_SAMPLES ?? "12", 10); +const SCRAPE_EVERY_MS = Number.parseInt(process.env.LIVE_RELAY_TEST_SCRAPE_EVERY_MS ?? "1500", 10); +const PROBE_TIMEOUT_SECONDS = Number.parseInt(process.env.LIVE_RELAY_TEST_TIMEOUT_SECONDS ?? "8", 10); +const WRITE_VERIFY_READ = process.env.LIVE_RELAY_TEST_WRITE_VERIFY_READ === "1"; +const ENFORCE_NO_WRITE_CONFIRM_FAILURES = process.env.LIVE_RELAY_TEST_EXPECT_NO_FAILURES === "1"; + +function randomPort(): number { + return 20_000 + Math.floor(Math.random() * 10_000); +} + +function httpGet( + port: number, + path: string, +): Promise<{ statusCode: number; body: string; headers: http.IncomingHttpHeaders }> { + return new Promise((resolve, reject) => { + const req = http.request( + { + hostname: "127.0.0.1", + port, + path, + method: "GET", + timeout: 5_000, + }, + (res) => { + const chunks: Buffer[] = []; + res.on("data", (chunk) => { + chunks.push(Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk)); + }); + res.on("end", () => { + resolve({ + statusCode: res.statusCode ?? 0, + body: Buffer.concat(chunks).toString("utf8"), + headers: res.headers, + }); + }); + }, + ); + req.on("timeout", () => { + req.destroy(new Error(`timeout GET ${path}`)); + }); + req.on("error", reject); + req.end(); + }); +} + +type ExporterChild = ChildProcessByStdio; + +async function waitForStartup(child: ExporterChild, port: number, timeoutMs: number): Promise { + const started = Date.now(); + while (Date.now() - started < timeoutMs) { + if (child.exitCode !== null) { + throw new Error(`exporter exited early with code ${child.exitCode}`); + } + try { + const res = await httpGet(port, "/healthz"); + if ([200, 503].includes(res.statusCode)) return; + } catch { + // server not up yet + } + await sleep(150); + } + throw new Error("timeout waiting for exporter startup"); +} + +function metricValue(metrics: string, name: string, labels: Record): number | null { + const entries = Object.entries(labels); + const labelsPattern = entries + .map(([k, v]) => `${k}="${v.replace(/[.*+?^${}()|[\]\\]/g, "\\$&")}"`) + .join(","); + const linePattern = new RegExp(`^${name}\\{${labelsPattern}\\} (-?\\d+(?:\\.\\d+)?)$`, "m"); + const match = metrics.match(linePattern); + if (!match?.[1]) return null; + const parsed = Number.parseFloat(match[1]); + return Number.isFinite(parsed) ? parsed : null; +} + +test( + "live exporter probe diagnostic against offchain.pub", + { skip: !ENABLED }, + async (t) => { + const relayInput = process.env.LIVE_RELAY_TEST_RELAYS ?? "wss://offchain.pub"; + const relays = relayInput + .split(",") + .map((v) => v.trim()) + .filter((v) => v.length > 0); + assert.ok(relays.length > 0, "LIVE_RELAY_TEST_RELAYS must include at least one relay"); + + let relayLabel: string; + try { + relayLabel = new URL(relays[0]).toString(); + } catch { + assert.fail(`invalid first relay URL in LIVE_RELAY_TEST_RELAYS: ${relays[0]}`); + } + + const port = randomPort(); + const child = spawn("pnpm", ["exec", "tsx", "src/index.ts"], { + cwd: process.cwd(), + env: { + ...process.env, + RELAYS: relays.join(","), + PROBE_INTERVAL_SECONDS: "1", + PROBE_TIMEOUT_SECONDS: String(PROBE_TIMEOUT_SECONDS), + WRITE_CHECK_ENABLED: "true", + WRITE_CHECK_VERIFY_READ: WRITE_VERIFY_READ ? "true" : "false", + PORT: String(port), + LOG_LEVEL: "error", + }, + stdio: ["ignore", "pipe", "pipe"], + }); + + let stderr = ""; + child.stderr.on("data", (d: Buffer) => { + stderr += d.toString("utf8"); + }); + + t.after(() => { + if (child.exitCode === null) { + child.kill("SIGTERM"); + } + }); + + await waitForStartup(child, port, 10_000); + + let lastMetrics = ""; + for (let i = 0; i < SCRAPE_SAMPLES; i += 1) { + const metrics = await httpGet(port, "/metrics"); + assert.equal(metrics.statusCode, 200); + lastMetrics = metrics.body; + await sleep(SCRAPE_EVERY_MS); + } + + const writeConfirmOk = metricValue(lastMetrics, "nostr_relay_write_confirm_ok", { relay: relayLabel }); + const writeConfirmErrors = metricValue(lastMetrics, "nostr_relay_probe_errors_total", { + relay: relayLabel, + check: "write_confirm", + }); + const successRuns = metricValue(lastMetrics, "nostr_relay_probe_runs_total", { + relay: relayLabel, + result: "success", + }); + const failureRuns = metricValue(lastMetrics, "nostr_relay_probe_runs_total", { + relay: relayLabel, + result: "failure", + }); + + const diagnostic = { + relay: relayLabel, + relays, + scrapeSamples: SCRAPE_SAMPLES, + scrapeEveryMs: SCRAPE_EVERY_MS, + writeVerifyRead: WRITE_VERIFY_READ, + writeConfirmOk, + writeConfirmErrors, + successRuns, + failureRuns, + crashed: child.exitCode !== null, + }; + console.log("[live-offchain-exporter]", JSON.stringify(diagnostic)); + + assert.equal(child.exitCode, null, `exporter crashed unexpectedly: ${stderr}`); + assert.notEqual(writeConfirmOk, null, "missing nostr_relay_write_confirm_ok metric"); + assert.ok( + successRuns !== null || failureRuns !== null, + "missing nostr_relay_probe_runs_total metrics", + ); + + if (ENFORCE_NO_WRITE_CONFIRM_FAILURES) { + assert.equal(writeConfirmErrors ?? 0, 0, `write_confirm failures seen: ${JSON.stringify(diagnostic)}`); + assert.equal(failureRuns ?? 0, 0, `probe failures seen: ${JSON.stringify(diagnostic)}`); + } + }, +);