From e4aa8aef7d975d6057aa620726ae9737b3acbf2d Mon Sep 17 00:00:00 2001
From: Theodore Li
Date: Thu, 4 Jun 2026 18:44:37 -0700
Subject: [PATCH] fix(otel): make service.instance.id unique per process
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

All app replicas shared a hardcoded service.instance.id
("mothership-sim"), so OTel metrics from every process collapsed into
one Prometheus series. Their independent cumulative counters then
interleaved, producing phantom counter resets that corrupt
rate()/increase() — staging hosted-key cost inflated to ~$0.72 from a
few cents, while no-`key` metrics (cost_charged, throttled,
queue_wait_*) were affected fleet-wide.

Append the hostname (the container id under ECS, unique per task) so
each replica gets its own series and sum(rate(...)) /
sum(increase(...)) aggregate correctly. The mothership-sim prefix is
kept so Jaeger's clock-skew adjuster still separates Sim from Go.

Co-Authored-By: Claude Opus 4.8 (1M context)
---
 apps/sim/instrumentation-node.ts | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/apps/sim/instrumentation-node.ts b/apps/sim/instrumentation-node.ts
index fae2a3a265..e52eb884a9 100644
--- a/apps/sim/instrumentation-node.ts
+++ b/apps/sim/instrumentation-node.ts
@@ -2,6 +2,7 @@
 // prefix (`sim-mothership:` / `go-mothership:`) to separate the two
 // halves of a mothership trace in the OTLP backend.
 
+import { hostname } from 'node:os'
 import type { Attributes, Context, Link, SpanKind } from '@opentelemetry/api'
 import { DiagConsoleLogger, DiagLogLevel, diag, TraceFlags, trace } from '@opentelemetry/api'
 import type {
@@ -259,10 +260,12 @@ async function initializeOpenTelemetry() {
       exportIntervalMillis: 60000,
     })
 
-    // Unique instance id per origin keeps Jaeger's clock-skew adjuster
-    // from grouping Sim+Go spans together (they'd see multi-second
-    // drift as intra-service and emit spurious warnings).
-    const serviceInstanceId = `${telemetryConfig.serviceName}-${SERVICE_INSTANCE_SLUG}`
+    // Must be unique per process: replicas sharing one instance id collapse
+    // into a single Prometheus series, so their independent cumulative
+    // counters interleave and corrupt rate()/increase(). The slug keeps Sim
+    // distinct from Go for Jaeger's clock-skew grouping; the hostname (the
+    // container id under ECS) makes each replica its own series.
+    const serviceInstanceId = `${telemetryConfig.serviceName}-${SERVICE_INSTANCE_SLUG}-${hostname()}`
     const resource = defaultResource().merge(
       resourceFromAttributes({
         [ATTR_SERVICE_NAME]: telemetryConfig.serviceName,