-
Notifications
You must be signed in to change notification settings - Fork 4k
Implement otel retry metrics #12064
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Implement otel retry metrics #12064
Changes from 5 commits
c3b473a
6c6a0f5
f9c5a68
bd69ed5
fab9a26
43a746b
ceb928a
4b9ff17
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -71,6 +71,7 @@ | |
| */ | ||
| final class OpenTelemetryMetricsModule { | ||
| private static final Logger logger = Logger.getLogger(OpenTelemetryMetricsModule.class.getName()); | ||
| private static final double NANOS_PER_SEC = 1_000_000_000.0; | ||
|
ejona86 marked this conversation as resolved.
Outdated
|
||
| public static final ImmutableSet<String> DEFAULT_PER_CALL_METRICS_SET = | ||
| ImmutableSet.of( | ||
| "grpc.client.attempt.started", | ||
|
|
@@ -292,9 +293,12 @@ static final class CallAttemptsTracerFactory extends ClientStreamTracer.Factory | |
| private final String fullMethodName; | ||
| private final List<OpenTelemetryPlugin.ClientCallPlugin> callPlugins; | ||
| private Status status; | ||
| private long retryDelayNanos; | ||
| private long callLatencyNanos; | ||
| private final Object lock = new Object(); | ||
| private final AtomicLong attemptsPerCall = new AtomicLong(); | ||
| private final AtomicLong hedgedAttemptsPerCall = new AtomicLong(); | ||
| private final AtomicLong transparentRetriesPerCall = new AtomicLong(); | ||
| @GuardedBy("lock") | ||
| private int activeStreams; | ||
| @GuardedBy("lock") | ||
|
|
@@ -331,6 +335,7 @@ public ClientStreamTracer newClientStreamTracer(StreamInfo info, Metadata metada | |
| } | ||
| if (++activeStreams == 1 && attemptStopwatch.isRunning()) { | ||
| attemptStopwatch.stop(); | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This attemptStopwatch looks to be for
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. CallAttemptsTracerFactory.attemptEnded is for recording a stream ended (StatsTraceContext.streamClosed calls it), so for the first attempt and each retry attempt end the attemptStopWatch is started, so it does measure the time between stream attempts, not per call. But there is a problem. RetriableStream creates its own anonymous tracerFactory and not take the one put by I have a question. A call can only have 1 stream, and the only way the activeStreams can be > 1 is if the tracer factory is shared between calls (and that would mess up the calculation for time between stream attempts anyway). The way the OpenTelemetryMetricsModule.MetricsClientInterceptor creates the factory though is per call, not shared between calls. So we we can never have > 1 streams and there should be no need for synchronization either.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What I said about RetriableStream's anonymous tracerFactory is not a problem, StatsTraceContext.streamClosed iterates over all tracers and calls stream closed on each of them. |
||
| retryDelayNanos = attemptStopwatch.elapsed(TimeUnit.NANOSECONDS); | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I can't tell how
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Its the same as with OpenCensus |
||
| } | ||
| } | ||
| // Skip recording for the first time, since it is already recorded in | ||
|
|
@@ -344,7 +349,11 @@ public ClientStreamTracer newClientStreamTracer(StreamInfo info, Metadata metada | |
| module.resource.clientAttemptCountCounter().add(1, attribute); | ||
| } | ||
| } | ||
| if (!info.isTransparentRetry()) { | ||
| if (info.isTransparentRetry()) { | ||
| transparentRetriesPerCall.incrementAndGet(); | ||
| } else if (info.isHedging()) { | ||
| hedgedAttemptsPerCall.incrementAndGet(); | ||
| } else { | ||
| attemptsPerCall.incrementAndGet(); | ||
|
Comment on lines
+351
to
356
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should this not assume them to be mutually exclusive ? |
||
| } | ||
| return newClientTracer(info); | ||
|
|
@@ -407,14 +416,65 @@ void recordFinishedCall() { | |
| tracer.recordFinishedAttempt(); | ||
| } | ||
| callLatencyNanos = callStopWatch.elapsed(TimeUnit.NANOSECONDS); | ||
| io.opentelemetry.api.common.Attributes attribute = | ||
| io.opentelemetry.api.common.Attributes.of(METHOD_KEY, fullMethodName, | ||
| TARGET_KEY, target, | ||
| STATUS_KEY, status.getCode().toString()); | ||
|
|
||
| // Base attributes | ||
| io.opentelemetry.api.common.Attributes baseAttributes = | ||
| io.opentelemetry.api.common.Attributes.of( | ||
| METHOD_KEY, fullMethodName, | ||
| TARGET_KEY, target | ||
| ); | ||
|
|
||
| // Duration | ||
| if (module.resource.clientCallDurationCounter() != null) { | ||
| module.resource.clientCallDurationCounter() | ||
| .record(callLatencyNanos * SECONDS_PER_NANO, attribute); | ||
| module.resource.clientCallDurationCounter().record( | ||
| callLatencyNanos * SECONDS_PER_NANO, | ||
|
kannanjgithub marked this conversation as resolved.
|
||
| baseAttributes.toBuilder() | ||
| .put(STATUS_KEY, status.getCode().toString()) | ||
| .build() | ||
| ); | ||
| } | ||
|
|
||
| // Retry counts | ||
| if (module.resource.clientCallRetriesCounter() != null) { | ||
|
|
||
|
AgraVator marked this conversation as resolved.
Outdated
|
||
| long retriesPerCall = 0; | ||
| long attempts = attemptsPerCall.get(); | ||
| if (attempts > 0) { | ||
| retriesPerCall = attempts - 1; | ||
| } | ||
|
|
||
| if (retriesPerCall > 0) { | ||
| module.resource.clientCallRetriesCounter().record(retriesPerCall, baseAttributes); | ||
|
kannanjgithub marked this conversation as resolved.
|
||
| } | ||
| } | ||
|
|
||
| // Hedge counts | ||
| if (module.resource.clientCallHedgesCounter() != null) { | ||
|
|
||
| long hedgesPerCall = 0; | ||
| long attempts = hedgedAttemptsPerCall.get(); | ||
| if (attempts > 0) { | ||
| hedgesPerCall = attempts - 1; | ||
| } | ||
|
kannanjgithub marked this conversation as resolved.
Outdated
|
||
|
|
||
| if (hedgesPerCall > 0) { | ||
| module.resource.clientCallHedgesCounter().record(hedgesPerCall, baseAttributes); | ||
| } | ||
| } | ||
|
|
||
| // Transparent Retry counts | ||
| if (module.resource.clientCallTransparentRetriesCounter() != null | ||
| && transparentRetriesPerCall.get() > 0) { | ||
| module.resource.clientCallTransparentRetriesCounter().record( | ||
| transparentRetriesPerCall.get(), baseAttributes); | ||
| } | ||
|
|
||
| // Retry delay | ||
| if (module.resource.clientCallRetryDelayCounter() != null) { | ||
| module.resource.clientCallRetryDelayCounter().record( | ||
| retryDelayNanos / NANOS_PER_SEC, | ||
| baseAttributes | ||
| ); | ||
| } | ||
| } | ||
| } | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.