Skip to content

Commit 2997115

Browse files
bruno-garcia and claude authored
feat: add Sentry Metrics API to scheduler jobs (#428)
* feat: add Sentry Metrics API to scheduler jobs

  Update Sentry .NET SDK from 6.0.0 to 6.2.0-alpha.0 to get the new Metrics API, then instrument all scheduler jobs and the download worker with counters, gauges, and distributions for monitoring in Sentry dashboards.

  Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* ci: add Playwright workflow for .NET Blazor e2e tests

  The Angular-era playwright-workflow.yml was removed during the Blazor migration (#403). The new .NET-based PlaywrightTests use Testcontainers (PostgreSQL + ClickHouse) and boot the real app, so the workflow only needs .NET SDK and Docker (both available on ubuntu-latest).

  Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* fix: use IHub.Metrics instead of SentrySdk static, remove playwright workflow

  Use the DI-injected IHub instance for metrics emission instead of SentrySdk.Experimental.Metrics where IHub is available. DailyDownloadWorker keeps SentrySdk since it consistently uses the static API. Remove playwright-workflow.yml from this PR — will be addressed in a separate issue after fixing CI-specific test failures.

  Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
1 parent e8d0f6d commit 2997115

7 files changed

Lines changed: 67 additions & 1 deletion

src/Directory.Build.props

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -6,7 +6,7 @@
66
<GenerateDocumentationFile>true</GenerateDocumentationFile>
77
<NoWarn>$(NoWarn);1591</NoWarn>
88
<TreatWarningsAsErrors>true</TreatWarningsAsErrors>
9-
<SentryVersion>6.0.0</SentryVersion>
9+
<SentryVersion>6.2.0-alpha.0</SentryVersion>
1010
<!-- Version embedding: set via CI with -p:SourceRevisionId=<sha>, defaults to 'local' for dev -->
1111
<SourceRevisionId Condition="'$(SourceRevisionId)' == ''">local</SourceRevisionId>
1212
</PropertyGroup>

src/NuGetTrends.Scheduler/DailyDownloadPackageIdPublisher.cs

Lines changed: 9 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -75,6 +75,8 @@ public async Task Import(IJobCancellationToken token, PerformContext? context)
7575
logger.LogWarning("Job {JobId}: Skipping daily download publisher - another instance is already in progress", jobId);
7676
transaction.Finish(SpanStatus.Aborted);
7777
hub.CaptureCheckIn(JobScheduleConfig.DailyDownloadPublisher.MonitorSlug, CheckInStatus.Ok, checkInId); // Skipped is OK, not an error
78+
hub.Metrics.EmitCounter<int>("scheduler.job.skipped", 1,
79+
[new("job", "daily-download-publisher"), new("reason", "concurrent")]);
7880
throw new ConcurrentExecutionSkippedException(
7981
$"Job {jobId}: Daily download publisher skipped - another instance is already in progress");
8082
}
@@ -144,11 +146,18 @@ public async Task Import(IJobCancellationToken token, PerformContext? context)
144146
jobId, messageCount);
145147
}
146148

149+
hub.Metrics.EmitGauge<int>("scheduler.daily_download.packages_queued", messageCount,
150+
MeasurementUnit.None, [new("job", "daily-download-publisher")]);
151+
hub.Metrics.EmitCounter<int>("scheduler.job.completed", 1,
152+
[new("job", "daily-download-publisher"), new("status", "ok")]);
153+
147154
transaction.Finish(SpanStatus.Ok);
148155
hub.CaptureCheckIn(JobScheduleConfig.DailyDownloadPublisher.MonitorSlug, CheckInStatus.Ok, checkInId);
149156
}
150157
catch (Exception e)
151158
{
159+
hub.Metrics.EmitCounter<int>("scheduler.job.completed", 1,
160+
[new("job", "daily-download-publisher"), new("status", "error")]);
152161
transaction.Finish(e);
153162
hub.CaptureCheckIn(JobScheduleConfig.DailyDownloadPublisher.MonitorSlug, CheckInStatus.Error, checkInId);
154163
throw;

src/NuGetTrends.Scheduler/DailyDownloadWorker.cs

Lines changed: 20 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -235,6 +235,10 @@ private async Task OnConsumerOnReceived(object sender, BasicDeliverEventArgs ea)
235235
var receiveTimestamp = DateTimeOffset.UtcNow.ToUnixTimeMilliseconds();
236236
var latencyMs = receiveTimestamp - enqueuedTime.Value;
237237
queueProcessSpan.SetData("messaging.message.receive.latency", (double)latencyMs);
238+
239+
SentrySdk.Experimental.Metrics.EmitDistribution<double>("worker.queue_latency", (double)latencyMs,
240+
MeasurementUnit.Duration.Millisecond,
241+
[new("queue", queueName)]);
238242
}
239243

240244
List<string>? packageIds = null;
@@ -267,6 +271,12 @@ private async Task OnConsumerOnReceived(object sender, BasicDeliverEventArgs ea)
267271
}
268272

269273
consumer.Model.BasicAck(ea.DeliveryTag, false);
274+
275+
SentrySdk.Experimental.Metrics.EmitCounter<int>("worker.messages_processed", 1,
276+
[new("queue", queueName)]);
277+
SentrySdk.Experimental.Metrics.EmitDistribution<int>("worker.batch_size", packageIds.Count,
278+
MeasurementUnit.None, [new("queue", queueName)]);
279+
270280
queueProcessSpan.Finish(SpanStatus.Ok);
271281
transaction.Finish(SpanStatus.Ok);
272282
}
@@ -276,6 +286,8 @@ private async Task OnConsumerOnReceived(object sender, BasicDeliverEventArgs ea)
276286
// Don't report to Sentry as this is expected during outages
277287
_logger.LogWarning(e, "NuGet unavailable, requeueing batch of {Count} packages.", packageIds?.Count ?? 0);
278288
consumer.Model.BasicNack(ea.DeliveryTag, multiple: false, requeue: true);
289+
SentrySdk.Experimental.Metrics.EmitCounter<int>("worker.messages_requeued", 1,
290+
[new("queue", queueName), new("reason", "nuget_unavailable")]);
279291
queueProcessSpan.Finish(SpanStatus.Unavailable);
280292
transaction.Finish(SpanStatus.Unavailable);
281293
}
@@ -285,6 +297,8 @@ private async Task OnConsumerOnReceived(object sender, BasicDeliverEventArgs ea)
285297
// NACK the message so it gets redelivered later
286298
_logger.LogWarning(ae, "NuGet unavailable (multiple failures), requeueing batch of {Count} packages.", packageIds?.Count ?? 0);
287299
consumer.Model.BasicNack(ea.DeliveryTag, multiple: false, requeue: true);
300+
SentrySdk.Experimental.Metrics.EmitCounter<int>("worker.messages_requeued", 1,
301+
[new("queue", queueName), new("reason", "nuget_unavailable")]);
288302
queueProcessSpan.Finish(SpanStatus.Unavailable);
289303
transaction.Finish(SpanStatus.Unavailable);
290304
}
@@ -371,6 +385,12 @@ private async Task UpdateDownloadCount(IList<string> packageIds, ISpan parentSpa
371385
processDataSpan.SetData("deleted_packages", deletedPackageIds.Count);
372386
processDataSpan.Finish(SpanStatus.Ok);
373387

388+
SentrySdk.Experimental.Metrics.EmitCounter<int>("worker.packages_processed", clickHouseDownloads.Count);
389+
if (deletedPackageIds.Count > 0)
390+
{
391+
SentrySdk.Experimental.Metrics.EmitCounter<int>("worker.packages_deleted", deletedPackageIds.Count);
392+
}
393+
374394
using var scope = _services.CreateScope();
375395
await using var context = scope.ServiceProvider.GetRequiredService<NuGetTrendsContext>();
376396

src/NuGetTrends.Scheduler/NuGetCatalogImporter.cs

Lines changed: 13 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -65,6 +65,8 @@ public async Task Import(IJobCancellationToken token, PerformContext? context)
6565
_logger.LogWarning("Job {JobId}: Skipping catalog import - another import is already in progress", jobId);
6666
transaction.Finish(SpanStatus.Aborted);
6767
hub.CaptureCheckIn(JobScheduleConfig.CatalogImporter.MonitorSlug, CheckInStatus.Ok, checkInId); // Skipped is OK, not an error
68+
hub.Metrics.EmitCounter<int>("scheduler.job.skipped", 1,
69+
[new("job", "catalog-importer"), new("reason", "concurrent")]);
6870
throw new ConcurrentExecutionSkippedException(
6971
$"Job {jobId}: Catalog import skipped - another import is already in progress");
7072
}
@@ -79,6 +81,8 @@ public async Task Import(IJobCancellationToken token, PerformContext? context)
7981
jobId, availabilityState.UnavailableSince);
8082
transaction.Finish(SpanStatus.Unavailable);
8183
hub.CaptureCheckIn(JobScheduleConfig.CatalogImporter.MonitorSlug, CheckInStatus.Ok, checkInId); // Skipped is OK, not an error
84+
hub.Metrics.EmitCounter<int>("scheduler.job.skipped", 1,
85+
[new("job", "catalog-importer"), new("reason", "nuget_unavailable")]);
8286
return;
8387
}
8488

@@ -111,6 +115,9 @@ public async Task Import(IJobCancellationToken token, PerformContext? context)
111115
processingSpan.Finish(SpanStatus.Ok);
112116
transaction.Finish(SpanStatus.Ok);
113117
hub.CaptureCheckIn(JobScheduleConfig.CatalogImporter.MonitorSlug, CheckInStatus.Ok, checkInId);
118+
119+
hub.Metrics.EmitCounter<int>("scheduler.job.completed", 1,
120+
[new("job", "catalog-importer"), new("status", "ok")]);
114121
}
115122
catch (HttpRequestException e)
116123
{
@@ -120,6 +127,8 @@ public async Task Import(IJobCancellationToken token, PerformContext? context)
120127
transaction.Finish(e);
121128
hub.CaptureException(e);
122129
hub.CaptureCheckIn(JobScheduleConfig.CatalogImporter.MonitorSlug, CheckInStatus.Error, checkInId);
130+
hub.Metrics.EmitCounter<int>("scheduler.job.completed", 1,
131+
[new("job", "catalog-importer"), new("status", "error"), new("error_type", "http")]);
123132
throw;
124133
}
125134
catch (BrokenCircuitException e)
@@ -130,6 +139,8 @@ public async Task Import(IJobCancellationToken token, PerformContext? context)
130139
transaction.Finish(e);
131140
hub.CaptureException(e);
132141
hub.CaptureCheckIn(JobScheduleConfig.CatalogImporter.MonitorSlug, CheckInStatus.Error, checkInId);
142+
hub.Metrics.EmitCounter<int>("scheduler.job.completed", 1,
143+
[new("job", "catalog-importer"), new("status", "error"), new("error_type", "circuit_breaker")]);
133144
throw;
134145
}
135146
catch (Exception e)
@@ -138,6 +149,8 @@ public async Task Import(IJobCancellationToken token, PerformContext? context)
138149
transaction.Finish(e);
139150
hub.CaptureException(e);
140151
hub.CaptureCheckIn(JobScheduleConfig.CatalogImporter.MonitorSlug, CheckInStatus.Error, checkInId);
152+
hub.Metrics.EmitCounter<int>("scheduler.job.completed", 1,
153+
[new("job", "catalog-importer"), new("status", "error"), new("error_type", "unknown")]);
141154
throw;
142155
}
143156
}

src/NuGetTrends.Scheduler/NuGetTrends.Scheduler.csproj

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55
<ImplicitUsings>true</ImplicitUsings>
66
<ContainerRepository>nugettrends/nugettrends.scheduler</ContainerRepository>
77
<ContainerWorkingDirectory>/App/</ContainerWorkingDirectory>
8+
<!-- Suppress experimental API warning for Sentry Metrics (IHub.Metrics) -->
9+
<NoWarn>$(NoWarn);SENTRYTRACECONNECTEDMETRICS</NoWarn>
810
</PropertyGroup>
911

1012
<ItemGroup>

src/NuGetTrends.Scheduler/TfmAdoptionSnapshotRefresher.cs

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -146,19 +146,30 @@ public async Task Refresh(IJobCancellationToken token, PerformContext? context)
146146

147147
logger.LogInformation("Job {JobId}: TFM adoption snapshot refreshed with {Count} data points", jobId, insertCount);
148148

149+
hub.Metrics.EmitGauge<int>("scheduler.tfm_adoption.data_points", insertCount,
150+
MeasurementUnit.None, [new("job", "tfm-adoption"), new("mode", isBackfill ? "backfill" : "incremental")]);
151+
hub.Metrics.EmitGauge<int>("scheduler.tfm_adoption.tfm_month_combos", tfmPackages.Count,
152+
MeasurementUnit.None, [new("job", "tfm-adoption")]);
153+
hub.Metrics.EmitCounter<int>("scheduler.job.completed", 1,
154+
[new("job", "tfm-adoption"), new("status", "ok")]);
155+
149156
transaction.Finish(SpanStatus.Ok);
150157
hub.CaptureCheckIn(JobScheduleConfig.TfmAdoptionRefresher.MonitorSlug, CheckInStatus.Ok, checkInId);
151158
}
152159
catch (OperationCanceledException)
153160
{
154161
logger.LogWarning("Job {JobId}: TFM adoption snapshot refresh was cancelled", jobId);
162+
hub.Metrics.EmitCounter<int>("scheduler.job.completed", 1,
163+
[new("job", "tfm-adoption"), new("status", "cancelled")]);
155164
transaction.Finish(SpanStatus.Cancelled);
156165
hub.CaptureCheckIn(JobScheduleConfig.TfmAdoptionRefresher.MonitorSlug, CheckInStatus.Error, checkInId);
157166
throw;
158167
}
159168
catch (Exception ex)
160169
{
161170
logger.LogError(ex, "Job {JobId}: Failed to refresh TFM adoption snapshot", jobId);
171+
hub.Metrics.EmitCounter<int>("scheduler.job.completed", 1,
172+
[new("job", "tfm-adoption"), new("status", "error")]);
162173
transaction.Finish(ex);
163174
hub.CaptureException(ex);
164175
hub.CaptureCheckIn(JobScheduleConfig.TfmAdoptionRefresher.MonitorSlug, CheckInStatus.Error, checkInId);

src/NuGetTrends.Scheduler/TrendingPackagesSnapshotRefresher.cs

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -134,19 +134,30 @@ public async Task Refresh(IJobCancellationToken token, PerformContext? context)
134134

135135
logger.LogInformation("Job {JobId}: Trending packages snapshot refreshed with {Count} enriched packages", jobId, count);
136136

137+
hub.Metrics.EmitGauge<int>("scheduler.trending.packages_count", count,
138+
MeasurementUnit.None, [new("job", "trending-snapshot")]);
139+
hub.Metrics.EmitGauge<int>("scheduler.trending.new_packages_backfilled", newPackages,
140+
MeasurementUnit.None, [new("job", "trending-snapshot")]);
141+
hub.Metrics.EmitCounter<int>("scheduler.job.completed", 1,
142+
[new("job", "trending-snapshot"), new("status", "ok")]);
143+
137144
transaction.Finish(SpanStatus.Ok);
138145
hub.CaptureCheckIn(JobScheduleConfig.TrendingSnapshotRefresher.MonitorSlug, CheckInStatus.Ok, checkInId);
139146
}
140147
catch (OperationCanceledException)
141148
{
142149
logger.LogWarning("Job {JobId}: Trending packages snapshot refresh was cancelled", jobId);
150+
hub.Metrics.EmitCounter<int>("scheduler.job.completed", 1,
151+
[new("job", "trending-snapshot"), new("status", "cancelled")]);
143152
transaction.Finish(SpanStatus.Cancelled);
144153
hub.CaptureCheckIn(JobScheduleConfig.TrendingSnapshotRefresher.MonitorSlug, CheckInStatus.Error, checkInId);
145154
throw;
146155
}
147156
catch (Exception ex)
148157
{
149158
logger.LogError(ex, "Job {JobId}: Failed to refresh trending packages snapshot", jobId);
159+
hub.Metrics.EmitCounter<int>("scheduler.job.completed", 1,
160+
[new("job", "trending-snapshot"), new("status", "error")]);
150161
transaction.Finish(ex);
151162
hub.CaptureException(ex);
152163
hub.CaptureCheckIn(JobScheduleConfig.TrendingSnapshotRefresher.MonitorSlug, CheckInStatus.Error, checkInId);

0 commit comments

Comments
 (0)