From 2ec7e1c3d9d7ce94c82b8ca09e01e2d3d14a83d0 Mon Sep 17 00:00:00 2001
From: Zhongxuan Wang <daniewang@nvidia.com>
Date: Mon, 29 Jun 2026 16:00:30 -0700
Subject: [PATCH] docs: document and stabilize LLM token and cost field
 semantics

Add a canonical Token and Cost Field Semantics section to the provider response codecs page: a Usage/CostEstimate field reference, the per-provider token normalization table, granularity (per-call values vs the per-trajectory final_metrics aggregate), an exporter field-mapping table (ATOF/ATIF/OpenInference/OpenTelemetry), and a stability contract. Add brief field pointers and back-links on the OpenTelemetry, OpenInference, and ATIF exporter pages.

Lock the contract with characterization tests: the OpenTelemetry LLM span emits cost only (no token attributes), Usage ignores unmodeled provider subfields, and OpenAIChatCodec drops completion_tokens_details. No runtime behavior change.

Signed-off-by: Zhongxuan Wang <daniewang@nvidia.com>
---
 .../tests/unit/codec/openai_chat_tests.rs     |  25 ++++
 .../core/tests/unit/codec/response_tests.rs   |  14 +++
 .../tests/unit/observability/otel_tests.rs    |  37 ++++++
 .../provider-response-codecs.mdx              | 112 +++++++++++++++++-
 docs/observability-plugin/atif.mdx            |   7 ++
 docs/observability-plugin/openinference.mdx   |   6 +
 docs/observability-plugin/opentelemetry.mdx   |   6 +
 7 files changed, 202 insertions(+), 5 deletions(-)

diff --git a/crates/core/tests/unit/codec/openai_chat_tests.rs b/crates/core/tests/unit/codec/openai_chat_tests.rs
index eb418a03d..ce66859ad 100644
--- a/crates/core/tests/unit/codec/openai_chat_tests.rs
+++ b/crates/core/tests/unit/codec/openai_chat_tests.rs
@@ -115,6 +115,31 @@ fn test_decode_response_cached_tokens() {
     assert_eq!(usage.cache_read_tokens, Some(42));
 }
 
+#[test]
+fn test_decode_response_drops_completion_tokens_details() {
+    let codec = OpenAIChatCodec;
+    let response = json!({
+        "id": "chatcmpl-reasoning",
+        "model": "gpt-4o",
+        "choices": [{
+            "index": 0,
+            "message": { "role": "assistant", "content": "Hi" },
+            "finish_reason": "stop"
+        }],
+        "usage": {
+            "prompt_tokens": 100,
+            "completion_tokens": 50,
+            "total_tokens": 150,
+            "completion_tokens_details": { "reasoning_tokens": 40 }
+        }
+    });
+    let resp = codec.decode_response(&response).unwrap();
+    assert_eq!(resp.usage.as_ref().unwrap().completion_tokens, Some(50)); // modeled field kept
+    let serialized = serde_json::to_string(&resp).unwrap(); // whole response, not just `usage`
+    assert!(!serialized.contains("completion_tokens_details")); // detail object is dropped
+    assert!(!serialized.contains("reasoning_tokens")); // and so is the count nested inside it
+}
+
 #[test]
 fn test_decode_response_provider_reported_cost() {
     let codec = OpenAIChatCodec;
diff --git a/crates/core/tests/unit/codec/response_tests.rs b/crates/core/tests/unit/codec/response_tests.rs
index 396196bec..343357343 100644
--- a/crates/core/tests/unit/codec/response_tests.rs
+++ b/crates/core/tests/unit/codec/response_tests.rs
@@ -1457,6 +1457,20 @@ fn test_unknown_model_pricing_returns_none_without_blocking_usage() {
     assert_eq!(usage.prompt_tokens, Some(1_000));
 }
 
+#[test]
+fn test_usage_ignores_unmodeled_provider_subfields() {
+    let usage: Usage = serde_json::from_value(json!({
+        "prompt_tokens": 5,
+        "completion_tokens": 7,
+        "some_future_field": 99
+    }))
+    .unwrap(); // an unrecognized key must not make deserialization fail
+    assert_eq!(usage.prompt_tokens, Some(5));
+    assert_eq!(usage.completion_tokens, Some(7));
+    let reserialized = serde_json::to_value(&usage).unwrap(); // round-trip back to JSON
+    assert!(reserialized.get("some_future_field").is_none()); // the extra key is not carried over
+}
+
 // -------------------------------------------------------------------
 // FinishReason serialization
 // -------------------------------------------------------------------
diff --git a/crates/core/tests/unit/observability/otel_tests.rs b/crates/core/tests/unit/observability/otel_tests.rs
index 09d769ad4..1eeca4871 100644
--- a/crates/core/tests/unit/observability/otel_tests.rs
+++ b/crates/core/tests/unit/observability/otel_tests.rs
@@ -971,6 +971,43 @@ fn llm_end_with_unannotated_openai_response_uses_codec_cost() {
     );
 }
 
+#[test]
+fn llm_end_emits_cost_only_no_token_or_gen_ai_attributes() {
+    let _pricing_guard = pricing_test_mutex().lock().unwrap(); // held until the test returns
+    install_openai_disambiguation_pricing("priced-model");
+    let _reset_guard = ResetPricingResolverGuard; // presumably resets pricing on drop; confirm
+    // Build the processor under test and the UUID shared by the start and end events.
+    let (provider, exporter) = make_provider();
+    let mut processor = OtelEventProcessor::new(provider.clone(), "test-scope".to_string());
+    let uuid = Uuid::now_v7();
+    // Feed one LLM start/end pair; the end event is built with an OpenAI chat response.
+    processor.process(&make_start_event(uuid, None, "other", ScopeType::Llm, None));
+    processor.process(&make_end_event(
+        uuid,
+        None,
+        "other",
+        ScopeType::Llm,
+        Some(openai_chat_provider_response("priced-model")),
+    ));
+    processor.force_flush().unwrap();
+    // Exactly one span should be finished; gather its attribute keys.
+    let spans = exporter.get_finished_spans().unwrap();
+    assert_eq!(spans.len(), 1);
+    let keys: Vec<String> = spans[0]
+        .attributes
+        .iter()
+        .map(|kv| kv.key.as_str().to_string())
+        .collect();
+    // Cost attributes must exist; no key may mention tokens or use the gen_ai prefix.
+    assert!(keys.iter().any(|k| k == "nemo_relay.llm.cost.total"));
+    assert!(keys.iter().any(|k| k == "nemo_relay.llm.cost.currency"));
+    assert!(
+        keys.iter()
+            .all(|k| !k.to_ascii_lowercase().contains("token") && !k.starts_with("gen_ai")),
+        "no token or gen_ai attributes expected on the LLM span: {keys:?}"
+    );
+}
+
 #[test]
 fn llm_end_with_unpriced_response_model_uses_requested_model_cost() {
     let _pricing_guard = pricing_test_mutex().lock().unwrap();
diff --git a/docs/integrate-into-frameworks/provider-response-codecs.mdx b/docs/integrate-into-frameworks/provider-response-codecs.mdx
index e70ed3d78..71d764c5f 100644
--- a/docs/integrate-into-frameworks/provider-response-codecs.mdx
+++ b/docs/integrate-into-frameworks/provider-response-codecs.mdx
@@ -308,11 +308,113 @@ also enrich decoded custom response-codec output when the custom codec returns
 `model` and `usage` but omits `usage.cost`. Existing cost values are preserved,
 so provider-reported cost remains authoritative in the annotation.
 
-Observability exporters prefer an explicit cost in the raw payload, then
-normalized `Usage.cost`, then a derived estimate from model pricing. When cost is
-available, ATIF step metrics and final metrics include `cost_usd`,
-OpenInference includes the USD-denominated `llm.cost.total`, and OpenTelemetry
-includes `nemo_relay.llm.cost.total` and `nemo_relay.llm.cost.currency`.
+Observability exporters prefer codec-normalized usage and cost, then fall back to
+raw payload fields and model-pricing estimates, subject to each exporter's
+currency and reported-cost policy. When cost is available, each exporter projects
+it per the [exporter field mapping](#exporter-field-mapping) below.
+
+## Token and Cost Field Semantics
+
+This section is the stable reference for the token and cost fields on LLM end
+events, carried on `AnnotatedLlmResponse.usage`.
+
+### Granularity
+
+Every token and cost value is **per LLM call** (one provider completion) unless
+it is an explicit aggregate:
+
+- `AnnotatedLlmResponse.usage` — the single LLM end event it is attached to.
+- OpenTelemetry and OpenInference attributes — the single LLM span they appear on;
+  spans are never summed across calls.
+- ATIF steps — an exported LLM call typically yields a `user` start step (no
+  metrics) and an `agent` end step that carries the call's `metrics`.
+- ATIF `final_metrics.total_*` — the **only** aggregate: a per-trajectory sum of
+  the metric fields present on its steps. It excludes embedded subagent
+  trajectories and can be partial when some steps lack a given metric.
+
+No exporter emits a running cross-call total other than ATIF `final_metrics`.
+
+### Usage fields
+
+All normalized fields are optional. A provider may omit a field, while a codec can
+compute one (such as Anthropic's `total_tokens`) and configured pricing can
+synthesize `cost`. `Usage` has no catch-all field, so provider usage fields that
+Relay does not model are dropped.
+
+| Field | Meaning |
+|---|---|
+| `prompt_tokens` | Input/prompt tokens. |
+| `completion_tokens` | Output/completion tokens, passed through unmodified. For OpenAI Responses this is the provider's `output_tokens` (which per OpenAI already includes any reasoning tokens); the reasoning count is reported separately under `api_specific` and is not added on top. |
+| `total_tokens` | Provider-reported, or computed as `prompt_tokens + completion_tokens` by some codecs (such as Anthropic) when the provider omits it. |
+| `cache_read_tokens` | Prompt-cache read tokens, when the provider reports prompt caching. |
+| `cache_write_tokens` | Prompt-cache write tokens (Anthropic-style providers). |
+| `cost` | Normalized `CostEstimate`, when reported by the provider or estimable from configured pricing. |
+
+Built-in codecs normalize provider field names as follows:
+
+| Normalized field | OpenAI Chat | OpenAI Responses | Anthropic Messages |
+|---|---|---|---|
+| `prompt_tokens` | `prompt_tokens` | `input_tokens` | `input_tokens` |
+| `completion_tokens` | `completion_tokens` | `output_tokens` | `output_tokens` |
+| `total_tokens` | `total_tokens` | `total_tokens` | computed |
+| `cache_read_tokens` | `prompt_tokens_details.cached_tokens` | `input_tokens_details.cached_tokens` | `cache_read_input_tokens` |
+| `cache_write_tokens` | — | — | `cache_creation_input_tokens` |
+
+Built-in codecs preserve only modeled provider-specific usage details under
+`api_specific`; other usage fields are dropped. For example, OpenAI Responses
+reasoning token counts are kept under `api_specific` (`output_tokens_details`),
+but OpenAI Chat `completion_tokens_details` is not.
+
+### Cost fields
+
+`CostEstimate` carries cost amounts (in `currency`) plus pricing provenance. See
+[Cost Estimation](#cost-estimation) above for resolution order and pricing setup.
+
+| Field | Meaning |
+|---|---|
+| `total` | Optional total cost in `currency`. When absent, some exporters derive a total from the component amounts. |
+| `currency` | ISO 4217 code; defaults to `USD`. |
+| `input` / `output` / `cache_read` / `cache_write` | Per-category amounts in `currency`. |
+| `source` | `provider_reported` (authoritative) or `model_pricing` (estimated). |
+| `pricing_provider` / `pricing_model` / `pricing_as_of` / `pricing_source` | Estimate provenance, for auditing stale pricing. |
+
+Missing is not zero: an absent `cost` or token field means unknown, while an
+explicit `0` is a reported value and is preserved. Relay does not convert
+currencies.
+
+### Exporter field mapping
+
+Each exporter projects `usage`/`cost` differently; projections do not change the
+canonical fields above.
+
+| | ATOF | ATIF step / `final_metrics` | OpenInference | OpenTelemetry |
+|---|---|---|---|---|
+| Prompt tokens | full `usage` preserved | `prompt_tokens` / `total_prompt_tokens` | `llm.token_count.prompt` | not emitted |
+| Completion tokens | preserved | `completion_tokens` / `total_completion_tokens` | `llm.token_count.completion` | not emitted |
+| Total tokens | preserved | no first-class field (see note) | `llm.token_count.total` | not emitted |
+| Cache read / write | preserved | summed into `cached_tokens` / `total_cached_tokens` | `llm.token_count.prompt_details.cache_read` / `llm.token_count.prompt_details.cache_write` | not emitted |
+| Cost | full `cost` preserved | `cost_usd` / `total_cost_usd` (USD only) | `llm.cost.total` (USD only) | `nemo_relay.llm.cost.total` + `nemo_relay.llm.cost.currency` (any currency) |
+
+OpenTelemetry carries cost in any currency; ATIF and OpenInference report cost
+only when it is USD-denominated and otherwise omit it. ATIF derives metrics from
+codec-normalized usage where available and fills missing supported fields from the
+raw payload. `metrics.extra` holds only unmapped keys from the raw
+`usage`/`token_usage` object (for example reasoning token counts, or a raw
+`total_tokens`), and only when the step already has a recognized metric;
+normalized-only or total-only values are not projected.
+
+### Stability
+
+The `Usage` and `CostEstimate` field names and meanings, and the exporter
+mappings above, are stable as of ATOF `0.1` (ATIF schema `ATIF-v1.7`, pricing
+catalog `version: 1`). New optional fields may be added to the serialized
+JSON/ATOF shapes; renames or removals are breaking and called out in release
+notes. (The Rust `Usage` and `CostEstimate` structs and the `CostSource` enum are
+exhaustive, so adding a field or variant is a source-breaking change for Rust
+consumers.) Behavior that is intentional in this release but may change later:
+OpenTelemetry emits cost only, not token counts; ATIF and OpenInference report
+cost only in USD; reasoning tokens are not a first-class `Usage` field; and
+bindings expose `usage`/`cost` as snake_case JSON rather than typed objects.
 
 ## Built-in Response Codecs
 
diff --git a/docs/observability-plugin/atif.mdx b/docs/observability-plugin/atif.mdx
index f509a5274..7cce401f7 100644
--- a/docs/observability-plugin/atif.mdx
+++ b/docs/observability-plugin/atif.mdx
@@ -199,6 +199,13 @@ agent scope UUID. Each step's `extra.ancestry.function_id` is the event UUID,
 and `extra.ancestry.parent_id` is the parent event UUID. Trace spans expose the
 same values as `nemo_relay.uuid` and `nemo_relay.parent_uuid` attributes.
 
+When present, a step's `metrics` can carry `prompt_tokens`, `completion_tokens`,
+`cached_tokens` (cache read + write), and `cost_usd` (USD only); the trajectory
+`final_metrics` sums the metrics present on its steps as `total_*`. See
+[Token and Cost Field Semantics](/integrate-into-frameworks/provider-response-codecs#token-and-cost-field-semantics)
+for the full mapping, including how ATIF sources these values from the codec
+annotation and the raw payload.
+
 ATIF is a trajectory projection over NeMo Relay events. It should preserve the
 meaning of scope parentage, event UUIDs, codec annotations, and exporter-local
 lineage rules without becoming the source of truth for runtime ownership,
diff --git a/docs/observability-plugin/openinference.mdx b/docs/observability-plugin/openinference.mdx
index 3bad14871..f3b0f221f 100644
--- a/docs/observability-plugin/openinference.mdx
+++ b/docs/observability-plugin/openinference.mdx
@@ -87,6 +87,12 @@ attributes. These values match ATIF `step.extra.ancestry.function_id` and
 the root agent span's `nemo_relay.uuid` also matches the ATIF `session_id`.
 Backend-native `trace_id` and `span_id` values are not written into ATIF.
 
+LLM token counts appear as `llm.token_count.prompt`, `llm.token_count.completion`,
+`llm.token_count.total`, `llm.token_count.prompt_details.cache_read`, and `llm.token_count.prompt_details.cache_write`;
+cost appears as USD-denominated `llm.cost.total`. See
+[Token and Cost Field Semantics](/integrate-into-frameworks/provider-response-codecs#token-and-cost-field-semantics)
+for the full mapping.
+
 Redact sensitive event payloads with sanitize guardrails before production
 export.
 
diff --git a/docs/observability-plugin/opentelemetry.mdx b/docs/observability-plugin/opentelemetry.mdx
index c26ade52a..fd3405157 100644
--- a/docs/observability-plugin/opentelemetry.mdx
+++ b/docs/observability-plugin/opentelemetry.mdx
@@ -73,6 +73,12 @@ attributes. These values match ATIF `step.extra.ancestry.function_id` and
 the root agent span's `nemo_relay.uuid` also matches the ATIF `session_id`.
 Backend-native `trace_id` and `span_id` values are not written into ATIF.
 
+For LLM end spans, cost is emitted as `nemo_relay.llm.cost.total` and
+`nemo_relay.llm.cost.currency` (any currency). Token counts are not emitted as
+discrete attributes. See
+[Token and Cost Field Semantics](/integrate-into-frameworks/provider-response-codecs#token-and-cost-field-semantics)
+for the full mapping.
+
 Register the plugin before the first instrumented request, use stable service
 identity fields, keep credentials outside source code, and flush during
 graceful shutdown.