Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
de02e51
OTEL-2540 Add SpanContext to persistent queue
jackgopack4 May 6, 2025
27cad76
OTEL-2540 add changelog
jackgopack4 May 6, 2025
bce7256
OTEL-2540 add marshalRequestWithSpanContext and unmarshalRequestWithS…
jackgopack4 May 7, 2025
e2ee866
add test coverage
jackgopack4 May 7, 2025
9ac33b5
add persistentqueue benchmark test OTEL-2540
jackgopack4 May 9, 2025
c8be153
switch marshal from JSON to byte-based
jackgopack4 May 9, 2025
e04b5c6
switch approach to make multiple storage operations OTEL-2540
jackgopack4 May 9, 2025
8685a6e
remove inadvertent debug code
jackgopack4 May 9, 2025
fdeba3a
add test coverage OTEL-2540
jackgopack4 May 12, 2025
dba9427
add exporter.PersistSpanContext featuregate OTEL-2540
jackgopack4 May 13, 2025
fe26a0c
add comments OTEL-2540
jackgopack4 May 13, 2025
94df316
add test coverage OTEL-2540
jackgopack4 May 14, 2025
9592b65
rename spanContextWrapper and spanContext objects
jackgopack4 May 15, 2025
e934aa0
change approach to local spancontext
jackgopack4 May 19, 2025
bbd9198
OTEL-2540 unit tests
jackgopack4 May 19, 2025
b1b9771
add unit test coverage OTEL-2540
jackgopack4 May 20, 2025
2940da2
apply suggestions from code review
jackgopack4 May 21, 2025
dae829e
remove unnecessary helper function
jackgopack4 May 22, 2025
cf73142
fix linter and create persistent_queue_context
jackgopack4 Jun 3, 2025
34b6200
Merge branch 'main' into jackgopack4/save-span-links-persistentqueue
mx-psi Jun 4, 2025
b1292da
Merge branch 'main' into jackgopack4/save-span-links-persistentqueue
jackgopack4 Jun 4, 2025
b8b78a9
Merge branch 'main' into jackgopack4/save-span-links-persistentqueue
jackgopack4 Jun 4, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions .chloggen/jackgopack4-add-spancontext-persistentqueue.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Use this changelog template to create an entry for release notes.

# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix'
change_type: 'enhancement'

# The name of the component, or a single word describing the area of concern, (e.g. otlpreceiver)
component: 'exporterhelper'

# A brief description of the change. Surround your text with quotes ("") if it needs to start with a backtick (`).
note: "Add `exporter.PersistRequestContext` feature gate to enable propagating SpanContext along with telemetry requests in the persistent queue"

# One or more tracking issues or pull requests related to the change
issues: [11740, 12212, 12934]

# (Optional) One or more lines of additional information to render under the primary note.
# These lines will be padded with 2 spaces and then inserted directly into the document.
# Use pipe (|) for multiline entries.
subtext: |
This change will allow internal telemetry spans to be processed when using persistent queue/storage.
When enabled, requests will use approximately 128 bytes more in persistent storage.

# Optional: The change log or logs in which this entry should be included.
# e.g. '[user]' or '[user, api]'
# Include 'user' if the change is relevant to end users.
# Include 'api' if there is a change to a library API.
# Default: '[user]'
change_logs: [user]
132 changes: 99 additions & 33 deletions exporter/exporterhelper/internal/queuebatch/persistent_queue.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import (
"context"
"encoding/binary"
"encoding/json"
"errors"
"fmt"
"strconv"
Expand Down Expand Up @@ -253,17 +254,25 @@
return err
}
}
// Operations will include item and write index (and context if spancontext feature enabled)
ops := make([]*storage.Operation, 2, 3)
ops[0] = storage.SetOperation(writeIndexKey, itemIndexToBytes(pq.metadata.WriteIndex+1))

reqBuf, err := pq.set.encoding.Marshal(req)
if err != nil {
return err
}
ops[1] = storage.SetOperation(getItemKey(pq.metadata.WriteIndex), reqBuf)

// Carry out a transaction where we both add the item and update the write index
ops := []*storage.Operation{
storage.SetOperation(writeIndexKey, itemIndexToBytes(pq.metadata.WriteIndex+1)),
storage.SetOperation(getItemKey(pq.metadata.WriteIndex), reqBuf),
if persistRequestContextFeatureGate.IsEnabled() {
contextBuf, scErr := getAndMarshalSpanContext(ctx)
if scErr != nil {
return scErr
}

Check warning on line 271 in exporter/exporterhelper/internal/queuebatch/persistent_queue.go

View check run for this annotation

Codecov / codecov/patch

exporter/exporterhelper/internal/queuebatch/persistent_queue.go#L270-L271

Added lines #L270 - L271 were not covered by tests
ops = append(ops, storage.SetOperation(getContextKey(pq.metadata.WriteIndex), contextBuf))
}

// Carry out a transaction where we add the item/context and update the write index
if err = pq.client.Batch(ctx, ops...); err != nil {
return err
}
Expand Down Expand Up @@ -295,7 +304,13 @@

// Read until either a successful retrieved element or no more elements in the storage.
for pq.metadata.ReadIndex != pq.metadata.WriteIndex {
index, req, consumed := pq.getNextItem(ctx)
index, req, consumed, restoredContext, err := pq.getNextItem(ctx)
if err != nil {
pq.logger.Debug("Failed to dispatch item", zap.Error(err))
if err = pq.itemDispatchingFinish(ctx, index); err != nil {
pq.logger.Error("Error deleting item from queue", zap.Error(err))
}
}
// Ensure the used size and the channel size are in sync.
if pq.metadata.ReadIndex == pq.metadata.WriteIndex {
pq.metadata.QueueSize = 0
Expand All @@ -304,7 +319,7 @@
if consumed {
id := indexDonePool.Get().(*indexDone)
id.reset(index, pq.set.sizer.Sizeof(req), pq)
return context.Background(), req, id, true
return restoredContext, req, id, true
}
}

Expand All @@ -317,37 +332,52 @@
// getNextItem pulls the next available item from the persistent storage along with its index. Once processing is
// finished, the index should be called with onDone to clean up the storage. If no new item is available,
// returns false.
func (pq *persistentQueue[T]) getNextItem(ctx context.Context) (uint64, T, bool) {
func (pq *persistentQueue[T]) getNextItem(ctx context.Context) (uint64, T, bool, context.Context, error) {
index := pq.metadata.ReadIndex
// Increase here, so even if errors happen below, it always iterates
pq.metadata.ReadIndex++
pq.metadata.CurrentlyDispatchedItems = append(pq.metadata.CurrentlyDispatchedItems, index)
getOp := storage.GetOperation(getItemKey(index))
err := pq.client.Batch(ctx,
storage.SetOperation(readIndexKey, itemIndexToBytes(pq.metadata.ReadIndex)),
storage.SetOperation(currentlyDispatchedItemsKey, itemIndexArrayToBytes(pq.metadata.CurrentlyDispatchedItems)),
getOp)
ops := make([]*storage.Operation, 3, 4)
ops[0] = storage.SetOperation(readIndexKey, itemIndexToBytes(pq.metadata.ReadIndex))
ops[1] = storage.SetOperation(currentlyDispatchedItemsKey, itemIndexArrayToBytes(pq.metadata.CurrentlyDispatchedItems))
ops[2] = getOp

var request T
if err == nil {
request, err = pq.set.encoding.Unmarshal(getOp.Value)
// Only add context operation if feature gate is enabled
var ctxOp *storage.Operation
if persistRequestContextFeatureGate.IsEnabled() {
ctxOp = storage.GetOperation(getContextKey(index))
ops = append(ops, ctxOp)
Comment thread
dmitryax marked this conversation as resolved.
}

var request T
restoredContext := context.Background()
err := pq.client.Batch(ctx, ops...)
if err != nil {
pq.logger.Debug("Failed to dispatch item", zap.Error(err))
// We need to make sure that currently dispatched items list is cleaned
if err = pq.itemDispatchingFinish(ctx, index); err != nil {
pq.logger.Error("Error deleting item from queue", zap.Error(err))
}
return 0, request, false, restoredContext, err
}
request, err = pq.set.encoding.Unmarshal(getOp.Value)
if err != nil {
return 0, request, false, ctx, err
}

return 0, request, false
// Only try to restore context if feature gate is enabled
if persistRequestContextFeatureGate.IsEnabled() {
var rc requestContext
if ctxOp.Value != nil {
unmarshalErr := json.Unmarshal(ctxOp.Value, &rc)
if unmarshalErr != nil {
return 0, request, false, ctx, unmarshalErr
}
restoredContext = contextWithLocalSpanContext(restoredContext, rc.SpanContext)
}
}

// Increase the reference count, so the client is not closed while the request is being processed.
// The client cannot be closed because we hold the lock since last we checked `stopped`.
pq.refClient++

return index, request, true
return index, request, true, restoredContext, nil
}

// onDone should be called to remove the item of the given index from the queue once processing is finished.
Expand Down Expand Up @@ -414,13 +444,29 @@

pq.logger.Info("Fetching items left for dispatch by consumers", zap.Int(zapNumberOfItems,
len(dispatchedItems)))
retrieveBatch := make([]*storage.Operation, len(dispatchedItems))
cleanupBatch := make([]*storage.Operation, len(dispatchedItems))

// Calculate batch sizes based on whether context persistence is enabled
batchSize := len(dispatchedItems)
if persistRequestContextFeatureGate.IsEnabled() {
batchSize *= 2
}

retrieveBatch := make([]*storage.Operation, batchSize)
cleanupBatch := make([]*storage.Operation, batchSize)

for i, it := range dispatchedItems {
key := getItemKey(it)
retrieveBatch[i] = storage.GetOperation(key)
cleanupBatch[i] = storage.DeleteOperation(key)
reqKey := getItemKey(it)
retrieveBatch[i] = storage.GetOperation(reqKey)
cleanupBatch[i] = storage.DeleteOperation(reqKey)

if persistRequestContextFeatureGate.IsEnabled() {
// store the context keys at the end of the batch
ctxKey := getContextKey(it)
retrieveBatch[i+len(dispatchedItems)] = storage.GetOperation(ctxKey)
cleanupBatch[i+len(dispatchedItems)] = storage.DeleteOperation(ctxKey)
}
}

retrieveErr := pq.client.Batch(ctx, retrieveBatch...)
cleanupErr := pq.client.Batch(ctx, cleanupBatch...)

Expand All @@ -434,18 +480,35 @@
}

errCount := 0
for _, op := range retrieveBatch {
// only need to iterate over first half of batch if spancontext is persisted as these items
// are at corresponding index in the second half of retrieveBatch
for idx := 0; idx < len(dispatchedItems); idx++ {
op := retrieveBatch[idx]
if op.Value == nil {
pq.logger.Warn("Failed retrieving item", zap.String(zapKey, op.Key), zap.Error(errValueNotSet))
continue
}
restoredContext := ctx
req, err := pq.set.encoding.Unmarshal(op.Value)
// If error happened or item is nil, it will be efficiently ignored
if err != nil {
pq.logger.Warn("Failed unmarshalling item", zap.String(zapKey, op.Key), zap.Error(err))
continue
}
if pq.putInternal(ctx, req) != nil {
// We will then retrieve the context from the back half of the batch list, see above
if persistRequestContextFeatureGate.IsEnabled() {
ctxOp := retrieveBatch[idx+len(dispatchedItems)]
if ctxOp.Value != nil {
var rc requestContext
unmarshalErr := json.Unmarshal(ctxOp.Value, &rc)
if unmarshalErr == nil {
restoredContext = contextWithLocalSpanContext(restoredContext, rc.SpanContext)
} else {
pq.logger.Warn("Failed retrieving request context, storing empty span context", zap.String(zapKey, ctxOp.Key), zap.Error(unmarshalErr))
}
}
}
if pq.putInternal(restoredContext, req) != nil {
errCount++
}
}
Expand All @@ -470,9 +533,12 @@
}
}

setOp := storage.SetOperation(currentlyDispatchedItemsKey, itemIndexArrayToBytes(pq.metadata.CurrentlyDispatchedItems))
deleteOp := storage.DeleteOperation(getItemKey(index))
if err := pq.client.Batch(ctx, setOp, deleteOp); err != nil {
setOps := []*storage.Operation{storage.SetOperation(currentlyDispatchedItemsKey, itemIndexArrayToBytes(pq.metadata.CurrentlyDispatchedItems))}
deleteOps := []*storage.Operation{storage.DeleteOperation(getItemKey(index))}
if persistRequestContextFeatureGate.IsEnabled() {
deleteOps = append(deleteOps, storage.DeleteOperation(getContextKey(index)))
}
if err := pq.client.Batch(ctx, append(setOps, deleteOps...)...); err != nil {
// got an error, try to gracefully handle it
pq.logger.Warn("Failed updating currently dispatched items, trying to delete the item first",
zap.Error(err))
Expand All @@ -481,12 +547,12 @@
return nil
}

if err := pq.client.Batch(ctx, deleteOp); err != nil {
if err := pq.client.Batch(ctx, deleteOps...); err != nil {
// Return an error here, as this indicates an issue with the underlying storage medium
return fmt.Errorf("failed deleting item from queue, got error from storage: %w", err)
}

if err := pq.client.Batch(ctx, setOp); err != nil {
if err := pq.client.Batch(ctx, setOps...); err != nil {
// even if this fails, we still have the right dispatched items in memory
// at worst, we'll have the wrong list in storage, and we'll discard the nonexistent items during startup
return fmt.Errorf("failed updating currently dispatched items, but deleted item successfully: %w", err)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
// Copyright The OpenTelemetry Authors
// SPDX-License-Identifier: Apache-2.0

package queuebatch // import "go.opentelemetry.io/collector/exporter/exporterhelper/internal/queuebatch"

import (
"context"
"encoding/hex"
"encoding/json"
"errors"
"strconv"

"go.opentelemetry.io/otel/trace"

"go.opentelemetry.io/collector/featuregate"
)

const (
	// errInvalidTraceFlagsLength is the message used (via errors.New) by
	// traceFlagsFromHex when the hex input does not decode to exactly one byte.
	errInvalidTraceFlagsLength = "trace flags must only be 1 byte"
)

// persistRequestContextFeatureGate controls whether request context should be persisted in the queue.
// When enabled, the SpanContext of the request's context is JSON-encoded and stored
// alongside each item in the persistent queue, then restored on read so that
// internal telemetry spans can still be linked after a restart.
// Registered at alpha stage since v0.127.0.
var persistRequestContextFeatureGate = featuregate.GlobalRegistry().MustRegister(
	"exporter.PersistRequestContext",
	featuregate.StageAlpha,
	featuregate.WithRegisterFromVersion("v0.127.0"),
	featuregate.WithRegisterDescription("controls whether context should be stored alongside requests in the persistent queue"),
	featuregate.WithRegisterReferenceURL("https://github.com/open-telemetry/opentelemetry-collector/pull/12934"),
)

// spanContext is a JSON-serializable mirror of trace.SpanContext,
// necessary due to SpanContext and SpanContextConfig not supporting Unmarshal interface,
// see https://github.com/open-telemetry/opentelemetry-go/issues/1819.
// NOTE: field names are part of the persisted JSON format; renaming them would
// break decoding of items already written to storage.
type spanContext struct {
	TraceID    string // hex-encoded trace ID (trace.TraceID.String())
	SpanID     string // hex-encoded span ID (trace.SpanID.String())
	TraceFlags string // hex-encoded single flags byte (trace.TraceFlags.String())
	TraceState string // W3C tracestate value (trace.TraceState.String())
	Remote     bool   // whether the span context was propagated from a remote parent
}

// localSpanContextFromTraceSpanContext converts a trace.SpanContext into the
// locally-defined, JSON-serializable spanContext representation. All ID and
// flag fields are captured via their canonical hex String() encodings.
func localSpanContextFromTraceSpanContext(sc trace.SpanContext) spanContext {
	out := spanContext{Remote: sc.IsRemote()}
	out.TraceID = sc.TraceID().String()
	out.SpanID = sc.SpanID().String()
	out.TraceFlags = sc.TraceFlags().String()
	out.TraceState = sc.TraceState().String()
	return out
}

// contextWithLocalSpanContext rebuilds a trace.SpanContext from its stored
// spanContext form and attaches it to ctx. If any component fails to parse
// (e.g. an all-zero trace ID from an empty stored context), ctx is returned
// unchanged as a graceful fallback.
func contextWithLocalSpanContext(ctx context.Context, sc spanContext) context.Context {
	var cfg trace.SpanContextConfig
	var err error

	if cfg.TraceID, err = trace.TraceIDFromHex(sc.TraceID); err != nil {
		return ctx
	}
	if cfg.SpanID, err = trace.SpanIDFromHex(sc.SpanID); err != nil {
		return ctx
	}
	flags, err := traceFlagsFromHex(sc.TraceFlags)
	if err != nil {
		return ctx
	}
	cfg.TraceFlags = *flags
	if cfg.TraceState, err = trace.ParseTraceState(sc.TraceState); err != nil {
		return ctx
	}
	cfg.Remote = sc.Remote

	return trace.ContextWithSpanContext(ctx, trace.NewSpanContext(cfg))
}

// requestContext wraps trace.SpanContext to allow for unmarshaling as well as
// future metadata key/value pairs to be added.
type requestContext struct {
	// SpanContext is the serializable span context captured from the request's
	// context when the item was written to the persistent queue.
	SpanContext spanContext
}

// traceFlagsFromHex decodes a hex-encoded trace flags string into a
// trace.TraceFlags value; it is the reverse of trace.TraceFlags.String, see
// https://github.com/open-telemetry/opentelemetry-go/blob/v1.35.0/trace/trace.go#L143-L168.
// The input must decode to exactly one byte.
func traceFlagsFromHex(hexStr string) (*trace.TraceFlags, error) {
	raw, err := hex.DecodeString(hexStr)
	if err != nil {
		return nil, err
	}
	if len(raw) != 1 {
		return nil, errors.New(errInvalidTraceFlagsLength)
	}
	tf := trace.TraceFlags(raw[0])
	return &tf, nil
}

// getAndMarshalSpanContext extracts the SpanContext from ctx and JSON-encodes
// it wrapped in a requestContext. When the exporter.PersistRequestContext
// feature gate is disabled it returns (nil, nil) so callers store nothing.
func getAndMarshalSpanContext(ctx context.Context) ([]byte, error) {
	if !persistRequestContextFeatureGate.IsEnabled() {
		return nil, nil
	}
	wrapped := requestContext{
		SpanContext: localSpanContextFromTraceSpanContext(trace.SpanContextFromContext(ctx)),
	}
	return json.Marshal(wrapped)
}

// getContextKey builds the storage key under which the persisted request
// context for the queue item at the given index is kept (the item itself is
// stored under the bare decimal index).
func getContextKey(index uint64) string {
	const suffix = "_context"
	return strconv.FormatUint(index, 10) + suffix
}
Loading
Loading