From 2ec7e1c3d9d7ce94c82b8ca09e01e2d3d14a83d0 Mon Sep 17 00:00:00 2001 From: Zhongxuan Wang Date: Mon, 29 Jun 2026 16:00:30 -0700 Subject: [PATCH] docs: document and stabilize LLM token and cost field semantics Add a canonical Token and Cost Field Semantics section to the provider response codecs page: a Usage/CostEstimate field reference, the per-provider token normalization table, granularity (per-call values vs the per-trajectory final_metrics aggregate), an exporter field-mapping table (ATOF/ATIF/OpenInference/OpenTelemetry), and a stability contract. Add brief field pointers and back-links on the OpenTelemetry, OpenInference, and ATIF exporter pages. Lock the contract with characterization tests: the OpenTelemetry LLM span emits cost only (no token attributes), Usage ignores unmodeled provider subfields, and OpenAIChatCodec drops completion_tokens_details. No runtime behavior change. Signed-off-by: Zhongxuan Wang --- .../tests/unit/codec/openai_chat_tests.rs | 25 ++++ .../core/tests/unit/codec/response_tests.rs | 14 +++ .../tests/unit/observability/otel_tests.rs | 37 ++++++ .../provider-response-codecs.mdx | 112 +++++++++++++++++- docs/observability-plugin/atif.mdx | 7 ++ docs/observability-plugin/openinference.mdx | 6 + docs/observability-plugin/opentelemetry.mdx | 6 + 7 files changed, 202 insertions(+), 5 deletions(-) diff --git a/crates/core/tests/unit/codec/openai_chat_tests.rs b/crates/core/tests/unit/codec/openai_chat_tests.rs index eb418a03d..ce66859ad 100644 --- a/crates/core/tests/unit/codec/openai_chat_tests.rs +++ b/crates/core/tests/unit/codec/openai_chat_tests.rs @@ -115,6 +115,31 @@ fn test_decode_response_cached_tokens() { assert_eq!(usage.cache_read_tokens, Some(42)); } +#[test] +fn test_decode_response_drops_completion_tokens_details() { + let codec = OpenAIChatCodec; + let response = json!({ + "id": "chatcmpl-reasoning", + "model": "gpt-4o", + "choices": [{ + "index": 0, + "message": { "role": "assistant", "content": "Hi" }, + 
"finish_reason": "stop" + }], + "usage": { + "prompt_tokens": 100, + "completion_tokens": 50, + "total_tokens": 150, + "completion_tokens_details": { "reasoning_tokens": 40 } + } + }); + let resp = codec.decode_response(&response).unwrap(); + assert_eq!(resp.usage.as_ref().unwrap().completion_tokens, Some(50)); + let serialized = serde_json::to_string(&resp).unwrap(); + assert!(!serialized.contains("completion_tokens_details")); + assert!(!serialized.contains("reasoning_tokens")); +} + #[test] fn test_decode_response_provider_reported_cost() { let codec = OpenAIChatCodec; diff --git a/crates/core/tests/unit/codec/response_tests.rs b/crates/core/tests/unit/codec/response_tests.rs index 396196bec..343357343 100644 --- a/crates/core/tests/unit/codec/response_tests.rs +++ b/crates/core/tests/unit/codec/response_tests.rs @@ -1457,6 +1457,20 @@ fn test_unknown_model_pricing_returns_none_without_blocking_usage() { assert_eq!(usage.prompt_tokens, Some(1_000)); } +#[test] +fn test_usage_ignores_unmodeled_provider_subfields() { + let usage: Usage = serde_json::from_value(json!({ + "prompt_tokens": 5, + "completion_tokens": 7, + "some_future_field": 99 + })) + .unwrap(); + assert_eq!(usage.prompt_tokens, Some(5)); + assert_eq!(usage.completion_tokens, Some(7)); + let reserialized = serde_json::to_value(&usage).unwrap(); + assert!(reserialized.get("some_future_field").is_none()); +} + // ------------------------------------------------------------------- // FinishReason serialization // ------------------------------------------------------------------- diff --git a/crates/core/tests/unit/observability/otel_tests.rs b/crates/core/tests/unit/observability/otel_tests.rs index 09d769ad4..1eeca4871 100644 --- a/crates/core/tests/unit/observability/otel_tests.rs +++ b/crates/core/tests/unit/observability/otel_tests.rs @@ -971,6 +971,43 @@ fn llm_end_with_unannotated_openai_response_uses_codec_cost() { ); } +#[test] +fn llm_end_emits_cost_only_no_token_or_gen_ai_attributes() { + let 
_pricing_guard = pricing_test_mutex().lock().unwrap(); + install_openai_disambiguation_pricing("priced-model"); + let _reset_guard = ResetPricingResolverGuard; + + let (provider, exporter) = make_provider(); + let mut processor = OtelEventProcessor::new(provider.clone(), "test-scope".to_string()); + let uuid = Uuid::now_v7(); + + processor.process(&make_start_event(uuid, None, "other", ScopeType::Llm, None)); + processor.process(&make_end_event( + uuid, + None, + "other", + ScopeType::Llm, + Some(openai_chat_provider_response("priced-model")), + )); + processor.force_flush().unwrap(); + + let spans = exporter.get_finished_spans().unwrap(); + assert_eq!(spans.len(), 1); + let keys: Vec<String> = spans[0] + .attributes + .iter() + .map(|kv| kv.key.as_str().to_string()) + .collect(); + + assert!(keys.iter().any(|k| k == "nemo_relay.llm.cost.total")); + assert!(keys.iter().any(|k| k == "nemo_relay.llm.cost.currency")); + assert!( + keys.iter() + .all(|k| !k.to_ascii_lowercase().contains("token") && !k.starts_with("gen_ai")), + "no token attributes expected on the LLM span: {keys:?}" + ); +} + #[test] fn llm_end_with_unpriced_response_model_uses_requested_model_cost() { let _pricing_guard = pricing_test_mutex().lock().unwrap(); diff --git a/docs/integrate-into-frameworks/provider-response-codecs.mdx b/docs/integrate-into-frameworks/provider-response-codecs.mdx index e70ed3d78..71d764c5f 100644 --- a/docs/integrate-into-frameworks/provider-response-codecs.mdx +++ b/docs/integrate-into-frameworks/provider-response-codecs.mdx @@ -308,11 +308,113 @@ also enrich decoded custom response-codec output when the custom codec returns `model` and `usage` but omits `usage.cost`. Existing cost values are preserved, so provider-reported cost remains authoritative in the annotation. -Observability exporters prefer an explicit cost in the raw payload, then -normalized `Usage.cost`, then a derived estimate from model pricing. 
When cost is -available, ATIF step metrics and final metrics include `cost_usd`, -OpenInference includes the USD-denominated `llm.cost.total`, and OpenTelemetry -includes `nemo_relay.llm.cost.total` and `nemo_relay.llm.cost.currency`. +Observability exporters prefer codec-normalized usage and cost, then fall back to +raw payload fields and model-pricing estimates, subject to each exporter's +currency and reported-cost policy. When cost is available, each exporter projects +it per the [exporter field mapping](#exporter-field-mapping) below. + +## Token and Cost Field Semantics + +This section is the stable reference for the token and cost fields on LLM end +events, carried on `AnnotatedLlmResponse.usage`. + +### Granularity + +Every token and cost value is **per LLM call** (one provider completion) unless +it is an explicit aggregate: + +- `AnnotatedLlmResponse.usage` — the single LLM end event it is attached to. +- OpenTelemetry and OpenInference attributes — the single LLM span they appear on; + spans are never summed across calls. +- ATIF steps — an exported LLM call typically yields a `user` start step (no + metrics) and an `agent` end step that carries the call's `metrics`. +- ATIF `final_metrics.total_*` — the **only** aggregate: a per-trajectory sum of + the metric fields present on its steps. It excludes embedded subagent + trajectories and can be partial. + +No exporter emits a running cross-call total other than ATIF `final_metrics`. + +### Usage fields + +All normalized fields are optional. A provider may omit a field, while a codec can +compute one (such as Anthropic's `total_tokens`) and configured pricing can +synthesize `cost`. `Usage` has no catch-all field, so provider usage fields that +Relay does not model are dropped. + +| Field | Meaning | +|---|---| +| `prompt_tokens` | Input/prompt tokens. | +| `completion_tokens` | Output/completion tokens, passed through unmodified. 
For OpenAI Responses this is the provider's `output_tokens` (which per OpenAI already includes any reasoning tokens); the reasoning count is reported separately under `api_specific` and is not added on top. | +| `total_tokens` | Provider-reported, or computed as `prompt + completion` by some codecs (such as Anthropic) when the provider omits it. | +| `cache_read_tokens` | Prompt-cache read tokens, when the provider reports prompt caching. | +| `cache_write_tokens` | Prompt-cache write tokens (Anthropic-style providers). | +| `cost` | Normalized `CostEstimate`, when reported by the provider or estimable from configured pricing. | + +Built-in codecs normalize provider field names as follows: + +| Normalized field | OpenAI Chat | OpenAI Responses | Anthropic Messages | +|---|---|---|---| +| `prompt_tokens` | `prompt_tokens` | `input_tokens` | `input_tokens` | +| `completion_tokens` | `completion_tokens` | `output_tokens` | `output_tokens` | +| `total_tokens` | `total_tokens` | `total_tokens` | computed | +| `cache_read_tokens` | `prompt_tokens_details.cached_tokens` | `input_tokens_details.cached_tokens` | `cache_read_input_tokens` | +| `cache_write_tokens` | — | — | `cache_creation_input_tokens` | + +Built-in codecs preserve only modeled provider-specific usage details under +`api_specific`; other usage fields are dropped. For example, OpenAI Responses +reasoning token counts are kept under `api_specific` (`output_tokens_details`), +but OpenAI Chat `completion_tokens_details` is not. + +### Cost fields + +`CostEstimate` carries cost amounts (in `currency`) plus pricing provenance. See +[Cost Estimation](#cost-estimation) above for resolution order and pricing setup. + +| Field | Meaning | +|---|---| +| `total` | Optional total cost in `currency`. When absent, some exporters derive a total from the component amounts. | +| `currency` | ISO 4217 code; defaults to `USD`. | +| `input` / `output` / `cache_read` / `cache_write` | Per-category amounts in `currency`. 
| +| `source` | `provider_reported` (authoritative) or `model_pricing` (estimated). | +| `pricing_provider` / `pricing_model` / `pricing_as_of` / `pricing_source` | Estimate provenance, for auditing stale pricing. | + +Missing is not zero: an absent `cost` or token field means unknown, while an +explicit `0` is a reported value and is preserved. Relay does not convert +currencies. + +### Exporter field mapping + +Each exporter projects `usage`/`cost` differently; projections do not change the +canonical fields above. + +| | ATOF | ATIF step / `final_metrics` | OpenInference | OpenTelemetry | +|---|---|---|---|---| +| Prompt tokens | full `usage` preserved | `prompt_tokens` / `total_prompt_tokens` | `llm.token_count.prompt` | not emitted | +| Completion tokens | preserved | `completion_tokens` / `total_completion_tokens` | `llm.token_count.completion` | not emitted | +| Total tokens | preserved | no first-class field (see note) | `llm.token_count.total` | not emitted | +| Cache read / write | preserved | summed into `cached_tokens` / `total_cached_tokens` | `llm.token_count.prompt_details.cache_read` / `…cache_write` | not emitted | +| Cost | full `cost` preserved | `cost_usd` / `total_cost_usd` (USD only) | `llm.cost.total` (USD only) | `nemo_relay.llm.cost.total` + `nemo_relay.llm.cost.currency` (any currency) | + +OpenTelemetry carries cost in any currency; ATIF and OpenInference report cost +only when it is USD-denominated and otherwise omit it. ATIF derives metrics from +codec-normalized usage where available and fills missing supported fields from the +raw payload. `metrics.extra` holds only unmapped keys from the raw +`usage`/`token_usage` object (for example reasoning token counts, or a raw +`total_tokens`), and only when the step already has a recognized metric; +normalized-only or total-only values are not projected. 
+ +### Stability + +The `Usage` and `CostEstimate` field names and meanings, and the exporter +mappings above, are stable as of ATOF `0.1` (ATIF schema `ATIF-v1.7`, pricing +catalog `version: 1`). New optional fields may be added to the serialized +JSON/ATOF shapes; renames or removals are breaking and called out in release +notes. (The Rust `Usage` and `CostEstimate` structs and the `CostSource` enum are +exhaustive, so adding a field or variant is a source-breaking change for Rust +consumers.) Behavior that is intentional in this release but may change later: +OpenTelemetry emits cost only, not token counts; ATIF and OpenInference report +cost only in USD; reasoning tokens are not a first-class `Usage` field; and +bindings expose `usage`/`cost` as snake_case JSON rather than typed objects. ## Built-in Response Codecs diff --git a/docs/observability-plugin/atif.mdx b/docs/observability-plugin/atif.mdx index f509a5274..7cce401f7 100644 --- a/docs/observability-plugin/atif.mdx +++ b/docs/observability-plugin/atif.mdx @@ -199,6 +199,13 @@ agent scope UUID. Each step's `extra.ancestry.function_id` is the event UUID, and `extra.ancestry.parent_id` is the parent event UUID. Trace spans expose the same values as `nemo_relay.uuid` and `nemo_relay.parent_uuid` attributes. +When present, a step's `metrics` can carry `prompt_tokens`, `completion_tokens`, +`cached_tokens` (cache read + write), and `cost_usd` (USD only); the trajectory +`final_metrics` sums the metrics present on its steps as `total_*`. See +[Token and Cost Field Semantics](/integrate-into-frameworks/provider-response-codecs#token-and-cost-field-semantics) +for the full mapping, including how ATIF sources these values from the codec +annotation and the raw payload. + ATIF is a trajectory projection over NeMo Relay events. 
It should preserve the meaning of scope parentage, event UUIDs, codec annotations, and exporter-local lineage rules without becoming the source of truth for runtime ownership, diff --git a/docs/observability-plugin/openinference.mdx b/docs/observability-plugin/openinference.mdx index 3bad14871..f3b0f221f 100644 --- a/docs/observability-plugin/openinference.mdx +++ b/docs/observability-plugin/openinference.mdx @@ -87,6 +87,12 @@ attributes. These values match ATIF `step.extra.ancestry.function_id` and the root agent span's `nemo_relay.uuid` also matches the ATIF `session_id`. Backend-native `trace_id` and `span_id` values are not written into ATIF. +LLM token counts appear as `llm.token_count.prompt`, `llm.token_count.completion`, +`llm.token_count.total`, and `llm.token_count.prompt_details.cache_read`/`cache_write`; +cost appears as USD-denominated `llm.cost.total`. See +[Token and Cost Field Semantics](/integrate-into-frameworks/provider-response-codecs#token-and-cost-field-semantics) +for the full mapping. + Redact sensitive event payloads with sanitize guardrails before production export. diff --git a/docs/observability-plugin/opentelemetry.mdx b/docs/observability-plugin/opentelemetry.mdx index c26ade52a..fd3405157 100644 --- a/docs/observability-plugin/opentelemetry.mdx +++ b/docs/observability-plugin/opentelemetry.mdx @@ -73,6 +73,12 @@ attributes. These values match ATIF `step.extra.ancestry.function_id` and the root agent span's `nemo_relay.uuid` also matches the ATIF `session_id`. Backend-native `trace_id` and `span_id` values are not written into ATIF. +For LLM end spans, cost is emitted as `nemo_relay.llm.cost.total` and +`nemo_relay.llm.cost.currency` (any currency). Token counts are not emitted as +discrete attributes. See +[Token and Cost Field Semantics](/integrate-into-frameworks/provider-response-codecs#token-and-cost-field-semantics) +for the full mapping. 
+ Register the plugin before the first instrumented request, use stable service identity fields, keep credentials outside source code, and flush during graceful shutdown.