Skip to content

Commit c90424b

Browse files
committed
Add duration metric to tool calls
1 parent e86926d commit c90424b

4 files changed

Lines changed: 220 additions & 56 deletions

File tree

util/opentelemetry-util-genai/src/opentelemetry/util/genai/handler.py

Lines changed: 12 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -137,33 +137,25 @@ def __init__(
137137
schema_url=schema_url,
138138
)
139139

140-
def _record_llm_metrics(
140+
def _record_metrics(
141141
self,
142-
invocation: LLMInvocation,
142+
invocation: GenAIInvocation,
143143
span: Span | None = None,
144144
*,
145145
error_type: str | None = None,
146146
) -> None:
147+
"""Record metrics for an invocation."""
147148
if self._metrics_recorder is None or span is None:
148149
return
150+
# Only LLMInvocation and ToolCall metrics are currently supported
151+
if not isinstance(invocation, (LLMInvocation, ToolCall)):
152+
return
149153
self._metrics_recorder.record(
150154
span,
151155
invocation,
152156
error_type=error_type,
153157
)
154158

155-
@staticmethod
156-
def _record_embedding_metrics(
157-
invocation: EmbeddingInvocation,
158-
span: Span | None = None,
159-
*,
160-
error_type: str | None = None,
161-
) -> None:
162-
# Metrics recorder currently supports LLMInvocation fields only.
163-
# Keep embedding metrics as a no-op until dedicated embedding
164-
# metric support is added.
165-
return
166-
167159
def _start(self, invocation: _T) -> _T:
168160
"""Start a GenAI invocation and create a pending span entry."""
169161
span_kind = SpanKind.CLIENT
@@ -203,13 +195,14 @@ def _stop(self, invocation: _T) -> _T:
203195
try:
204196
if isinstance(invocation, LLMInvocation):
205197
_apply_llm_finish_attributes(span, invocation)
206-
self._record_llm_metrics(invocation, span)
198+
self._record_metrics(invocation, span)
207199
_maybe_emit_llm_event(self._logger, span, invocation)
208200
elif isinstance(invocation, EmbeddingInvocation):
209201
_apply_embedding_finish_attributes(span, invocation)
210-
self._record_embedding_metrics(invocation, span)
202+
self._record_metrics(invocation, span)
211203
elif isinstance(invocation, ToolCall):
212204
_finish_tool_call_span(span, invocation, capture_content=True)
205+
self._record_metrics(invocation, span)
213206
finally:
214207
# Detach context and end span even if finishing fails
215208
otel_context.detach(invocation.context_token)
@@ -228,21 +221,18 @@ def _fail(self, invocation: _T, error: Error) -> _T:
228221
if isinstance(invocation, LLMInvocation):
229222
_apply_llm_finish_attributes(span, invocation)
230223
_apply_error_attributes(span, error, error_type)
231-
self._record_llm_metrics(
232-
invocation, span, error_type=error_type
233-
)
224+
self._record_metrics(invocation, span, error_type=error_type)
234225
_maybe_emit_llm_event(
235226
self._logger, span, invocation, error_type
236227
)
237228
elif isinstance(invocation, EmbeddingInvocation):
238229
_apply_embedding_finish_attributes(span, invocation)
239230
_apply_error_attributes(span, error, error_type)
240-
self._record_embedding_metrics(
241-
invocation, span, error_type=error_type
242-
)
231+
self._record_metrics(invocation, span, error_type=error_type)
243232
elif isinstance(invocation, ToolCall):
244233
invocation.error_type = error_type
245234
_finish_tool_call_span(span, invocation, capture_content=True)
235+
self._record_metrics(invocation, span, error_type=error_type)
246236
span.set_status(Status(StatusCode.ERROR, error.message))
247237
finally:
248238
# Detach context and end span even if finishing fails
Lines changed: 50 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
1-
"""Helpers for emitting GenAI metrics from LLM invocations."""
1+
"""Helpers for emitting GenAI metrics from invocations."""
22

33
from __future__ import annotations
44

55
import timeit
6-
from typing import Dict, Optional
6+
from typing import Dict, Optional, Union
77

88
from opentelemetry.metrics import Histogram, Meter
99
from opentelemetry.semconv._incubating.attributes import (
@@ -18,7 +18,7 @@
1818
create_duration_histogram,
1919
create_token_histogram,
2020
)
21-
from opentelemetry.util.genai.types import LLMInvocation
21+
from opentelemetry.util.genai.types import LLMInvocation, ToolCall
2222
from opentelemetry.util.types import AttributeValue
2323

2424

@@ -32,44 +32,41 @@ def __init__(self, meter: Meter):
3232
def record(
3333
self,
3434
span: Optional[Span],
35-
invocation: LLMInvocation,
35+
invocation: Union[LLMInvocation, ToolCall],
3636
*,
3737
error_type: Optional[str] = None,
3838
) -> None:
39-
"""Record duration and token metrics for an invocation if possible."""
39+
"""Record duration and token metrics for an invocation.
4040
41+
Supports LLMInvocation (with token metrics) and ToolCall (duration only).
42+
"""
4143
# pylint: disable=too-many-branches
4244

4345
if span is None:
4446
return
4547

46-
token_counts: list[tuple[int, str]] = []
47-
if invocation.input_tokens is not None:
48-
token_counts.append(
49-
(
50-
invocation.input_tokens,
51-
GenAI.GenAiTokenTypeValues.INPUT.value,
52-
)
48+
# Build attributes based on invocation type
49+
attributes: Dict[str, AttributeValue] = {}
50+
51+
if isinstance(invocation, LLMInvocation):
52+
attributes[GenAI.GEN_AI_OPERATION_NAME] = (
53+
GenAI.GenAiOperationNameValues.CHAT.value
5354
)
54-
if invocation.output_tokens is not None:
55-
token_counts.append(
56-
(
57-
invocation.output_tokens,
58-
GenAI.GenAiTokenTypeValues.OUTPUT.value,
55+
if invocation.request_model:
56+
attributes[GenAI.GEN_AI_REQUEST_MODEL] = (
57+
invocation.request_model
5958
)
60-
)
59+
if invocation.response_model_name:
60+
attributes[GenAI.GEN_AI_RESPONSE_MODEL] = (
61+
invocation.response_model_name
62+
)
63+
else:
64+
# ToolCall
65+
attributes[GenAI.GEN_AI_OPERATION_NAME] = "execute_tool"
6166

62-
attributes: Dict[str, AttributeValue] = {
63-
GenAI.GEN_AI_OPERATION_NAME: GenAI.GenAiOperationNameValues.CHAT.value
64-
}
65-
if invocation.request_model:
66-
attributes[GenAI.GEN_AI_REQUEST_MODEL] = invocation.request_model
67+
# Common attributes across invocation types
6768
if invocation.provider:
6869
attributes[GenAI.GEN_AI_PROVIDER_NAME] = invocation.provider
69-
if invocation.response_model_name:
70-
attributes[GenAI.GEN_AI_RESPONSE_MODEL] = (
71-
invocation.response_model_name
72-
)
7370
if invocation.server_address:
7471
attributes[server_attributes.SERVER_ADDRESS] = (
7572
invocation.server_address
@@ -79,7 +76,7 @@ def record(
7976
if invocation.metric_attributes:
8077
attributes.update(invocation.metric_attributes)
8178

82-
# Calculate duration from span timing or invocation monotonic start
79+
# Calculate duration from monotonic start time
8380
duration_seconds: Optional[float] = None
8481
if invocation.monotonic_start_s is not None:
8582
duration_seconds = max(
@@ -98,12 +95,31 @@ def record(
9895
context=span_context,
9996
)
10097

101-
for token_count, token_type in token_counts:
102-
self._token_histogram.record(
103-
token_count,
104-
attributes=attributes | {GenAI.GEN_AI_TOKEN_TYPE: token_type},
105-
context=span_context,
106-
)
98+
# Token metrics only for LLMInvocation
99+
if isinstance(invocation, LLMInvocation):
100+
token_counts: list[tuple[int, str]] = []
101+
if invocation.input_tokens is not None:
102+
token_counts.append(
103+
(
104+
invocation.input_tokens,
105+
GenAI.GenAiTokenTypeValues.INPUT.value,
106+
)
107+
)
108+
if invocation.output_tokens is not None:
109+
token_counts.append(
110+
(
111+
invocation.output_tokens,
112+
GenAI.GenAiTokenTypeValues.OUTPUT.value,
113+
)
114+
)
115+
116+
for token_count, token_type in token_counts:
117+
self._token_histogram.record(
118+
token_count,
119+
attributes=attributes
120+
| {GenAI.GEN_AI_TOKEN_TYPE: token_type},
121+
context=span_context,
122+
)
107123

108124

109125
__all__ = ["InvocationMetricsRecorder"]

util/opentelemetry-util-genai/src/opentelemetry/util/genai/types.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -411,6 +411,17 @@ class ToolCall(GenAIInvocation):
411411
# gen_ai.tool.call.result - Result returned by the tool (Opt-In, may contain sensitive data)
412412
tool_result: Any = None
413413

414+
# Metric-related fields (for gen_ai.client.operation.duration)
415+
provider: str | None = None # gen_ai.provider.name (Required for metrics)
416+
server_address: str | None = None # server.address (Recommended)
417+
server_port: int | None = None # server.port (Conditionally Required)
418+
metric_attributes: dict[str, Any] = field(
419+
default_factory=_new_str_any_dict
420+
)
421+
"""
422+
Additional attributes to set on metrics. Must be of low cardinality.
423+
"""
424+
414425
# Timing field (not inherited from GenAIInvocation, matches LLMInvocation pattern)
415426
monotonic_start_s: float | None = None
416427

util/opentelemetry-util-genai/tests/test_handler_metrics.py

Lines changed: 147 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -382,3 +382,150 @@ class ToolExecutionError(RuntimeError):
382382
"ToolExecutionError",
383383
span.attributes[error_attributes.ERROR_TYPE],
384384
)
385+
386+
387+
class TelemetryHandlerToolMetricsTest(TestBase):
388+
"""Tests for tool call metrics recording"""
389+
390+
def _harvest_metrics(self) -> Dict[str, List[Any]]:
391+
"""Returns metrics_by_name mapping metric name to list of data points."""
392+
metrics = self.get_sorted_metrics(SCOPE)
393+
metrics_by_name: Dict[str, List[Any]] = {}
394+
for metric in metrics or []:
395+
points = metric.data.data_points or []
396+
metrics_by_name.setdefault(metric.name, []).extend(points)
397+
return metrics_by_name
398+
399+
def test_stop_tool_call_records_duration(self) -> None:
400+
"""Test stop records duration metric for tool call"""
401+
handler = TelemetryHandler(
402+
tracer_provider=self.tracer_provider,
403+
meter_provider=self.meter_provider,
404+
)
405+
tool = ToolCall(
406+
name="get_weather",
407+
arguments={"location": "Paris"},
408+
id="call_123",
409+
provider="test-provider",
410+
)
411+
412+
with patch("timeit.default_timer", return_value=1000.0):
413+
handler.start(tool)
414+
415+
with patch("timeit.default_timer", return_value=1002.5):
416+
handler.stop(tool)
417+
418+
metrics = self._harvest_metrics()
419+
self.assertIn("gen_ai.client.operation.duration", metrics)
420+
duration_points = metrics["gen_ai.client.operation.duration"]
421+
self.assertEqual(len(duration_points), 1)
422+
duration_point = duration_points[0]
423+
424+
# Check required attributes
425+
self.assertEqual(
426+
duration_point.attributes[GenAI.GEN_AI_OPERATION_NAME],
427+
"execute_tool",
428+
)
429+
self.assertEqual(
430+
duration_point.attributes[GenAI.GEN_AI_PROVIDER_NAME],
431+
"test-provider",
432+
)
433+
434+
# Check duration value
435+
self.assertAlmostEqual(duration_point.sum, 2.5, places=3)
436+
437+
# Token metrics should NOT be recorded for tool calls
438+
self.assertNotIn("gen_ai.client.token.usage", metrics)
439+
440+
def test_stop_tool_call_records_duration_with_server_address(self) -> None:
441+
"""Test stop records duration with server attributes for tool call"""
442+
handler = TelemetryHandler(
443+
tracer_provider=self.tracer_provider,
444+
meter_provider=self.meter_provider,
445+
)
446+
tool = ToolCall(
447+
name="api_call",
448+
arguments={},
449+
id="call_456",
450+
provider="custom-provider",
451+
server_address="api.example.com",
452+
server_port=443,
453+
)
454+
455+
with patch("timeit.default_timer", return_value=100.0):
456+
handler.start(tool)
457+
458+
with patch("timeit.default_timer", return_value=100.5):
459+
handler.stop(tool)
460+
461+
metrics = self._harvest_metrics()
462+
duration_points = metrics["gen_ai.client.operation.duration"]
463+
self.assertEqual(len(duration_points), 1)
464+
duration_point = duration_points[0]
465+
466+
self.assertEqual(
467+
duration_point.attributes["server.address"], "api.example.com"
468+
)
469+
self.assertEqual(duration_point.attributes["server.port"], 443)
470+
471+
def test_stop_tool_call_records_metric_attributes(self) -> None:
472+
"""Test stop includes custom metric_attributes for tool call"""
473+
handler = TelemetryHandler(
474+
tracer_provider=self.tracer_provider,
475+
meter_provider=self.meter_provider,
476+
)
477+
tool = ToolCall(
478+
name="custom_tool",
479+
arguments={},
480+
provider="my-provider",
481+
)
482+
tool.metric_attributes = {"custom.key": "custom_value"}
483+
484+
with patch("timeit.default_timer", return_value=0.0):
485+
handler.start(tool)
486+
487+
with patch("timeit.default_timer", return_value=1.0):
488+
handler.stop(tool)
489+
490+
metrics = self._harvest_metrics()
491+
duration_point = metrics["gen_ai.client.operation.duration"][0]
492+
493+
self.assertEqual(
494+
duration_point.attributes["custom.key"], "custom_value"
495+
)
496+
497+
def test_fail_tool_call_records_duration_with_error(self) -> None:
498+
"""Test fail records duration with error.type for tool call"""
499+
handler = TelemetryHandler(
500+
tracer_provider=self.tracer_provider,
501+
meter_provider=self.meter_provider,
502+
)
503+
tool = ToolCall(
504+
name="failing_tool",
505+
arguments={},
506+
id="call_err",
507+
provider="err-provider",
508+
)
509+
510+
with patch("timeit.default_timer", return_value=500.0):
511+
handler.start(tool)
512+
513+
error = Error(message="Tool execution failed", type=RuntimeError)
514+
with patch("timeit.default_timer", return_value=501.5):
515+
handler.fail(tool, error)
516+
517+
metrics = self._harvest_metrics()
518+
self.assertIn("gen_ai.client.operation.duration", metrics)
519+
duration_points = metrics["gen_ai.client.operation.duration"]
520+
self.assertEqual(len(duration_points), 1)
521+
duration_point = duration_points[0]
522+
523+
# Check error.type is recorded
524+
self.assertEqual(
525+
duration_point.attributes["error.type"], "RuntimeError"
526+
)
527+
self.assertEqual(
528+
duration_point.attributes[GenAI.GEN_AI_OPERATION_NAME],
529+
"execute_tool",
530+
)
531+
self.assertAlmostEqual(duration_point.sum, 1.5, places=3)

0 commit comments

Comments (0)