:_mod-docs-content-type: ASSEMBLY
ifdef::context[:parent-context: {context}]

[id="configure-log-aggregation-and-observability-for-sonataflow_{context}"]
= Configure log aggregation and observability for SonataFlow


:context: configure-log-aggregation-and-observability-for-sonataflow

[role="_abstract"]
You must implement an observability strategy to make sure your serverless workflows are production-ready. By configuring structured JSON logging and integrating OpenTelemetry, you enable automated log aggregation, process-instance correlation, and distributed tracing.

// Enable structured JSON logging for SonataFlow workflows
include::../modules/extend_orchestrator-in-rhdh/proc-enable-structured-json-logging-for-sonataflow-workflows.adoc[leveloffset=+1]

// Configure file-based JSON logging and rotation
include::../modules/extend_orchestrator-in-rhdh/proc-configure-file-based-json-logging-and-log-rotation.adoc[leveloffset=+1]

// Correlate logs with OpenTelemetry traces
include::../modules/extend_orchestrator-in-rhdh/proc-correlate-logs-with-opentelemetry-traces.adoc[leveloffset=+1]

// Aggregate logs using the PLG stack
include::../modules/extend_orchestrator-in-rhdh/proc-aggregate-logs-using-the-plg-stack.adoc[leveloffset=+1]

// Alerts for workflow health
include::../modules/extend_orchestrator-in-rhdh/proc-configure-alerts-for-workflow-health.adoc[leveloffset=+1]

// Integrate with external systems
include::../modules/extend_orchestrator-in-rhdh/proc-integrate-workflows-with-external-systems.adoc[leveloffset=+1]

// Troubleshoot observability and logging issues
include::../modules/extend_orchestrator-in-rhdh/proc-troubleshoot-observability-and-logging-issues.adoc[leveloffset=+1]

// Telemetry configuration properties
include::../modules/extend_orchestrator-in-rhdh/ref-telemetry-configuration-properties.adoc[leveloffset=+1]
ifdef::parent-context[:context: {parent-context}]
ifndef::parent-context[:!context:]

:_mod-docs-content-type: ASSEMBLY
ifdef::context[:parent-context: {context}]

[id="configure-opentelemetry-for-sonataflow-workflows_{context}"]
= Configure OpenTelemetry for SonataFlow workflows


:context: configure-opentelemetry-for-sonataflow-workflows

[role="_abstract"]
To maintain high availability and performance of your serverless workflows, you must implement a comprehensive observability strategy. You can enable the OpenTelemetry extension, configure exporters for Jaeger and Loki, and interpret the generated telemetry data to monitor SonataFlow workflow health and performance.

// Enable OpenTelemetry for SonataFlow workflows
include::../modules/extend_orchestrator-in-rhdh/proc-enable-opentelemetry-for-sonataflow-workflows.adoc[leveloffset=+1]

// Configuring telemetry exporters for Jaeger and Loki
include::../modules/extend_orchestrator-in-rhdh/proc-configure-telemetry-exporters.adoc[leveloffset=+1]

// SonataFlow OpenTelemetry attributes and events
include::../modules/extend_orchestrator-in-rhdh/ref-observability-configuration-examples.adoc[leveloffset=+1]

// Troubleshooting OpenTelemetry integration
include::../modules/extend_orchestrator-in-rhdh/ref-troubleshoot-opentelemetry-connectivity.adoc[leveloffset=+1]
ifdef::parent-context[:context: {parent-context}]
ifndef::parent-context[:!context:]

:_mod-docs-content-type: PROCEDURE

[id="aggregate-logs-using-the-plg-stack_{context}"]
= Aggregate logs using the Promtail, Loki, and Grafana (PLG) stack

[role="_abstract"]
Deploy and configure a Promtail sidecar to scrape workflow logs and push them to a Loki instance for storage and visualization in Grafana.

.Prerequisites

* You have running Loki and Grafana instances in the cluster.

* You have configured your workflow for file-based JSON logging.

* You have `cluster-admin` permissions.

.Procedure

. Deploy the PLG stack using Helm:
+
[source,bash]
----
# Add Grafana Helm repository
helm repo add grafana https://grafana.github.io/helm-charts
helm repo update

# Create namespace
oc new-project sonataflow-observability

# Deploy Loki stack
helm install loki-stack grafana/loki-stack \
--namespace sonataflow-observability \
--set loki.persistence.enabled=true \
--set loki.persistence.size=20Gi \
--set promtail.config.logLevel=info \
--set grafana.enabled=true
----
+
[NOTE]
====
For production deployments, use a custom `values.yaml` file with appropriate resource limits and security contexts.
====
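+
For example, a production `values.yaml` might constrain Loki resources and enable a retention period. The values below are illustrative assumptions for the `loki-stack` chart, not tuned recommendations; adjust them to your environment:
+
[source,yaml]
----
# Illustrative production overrides for the loki-stack Helm chart.
loki:
  persistence:
    enabled: true
    size: 50Gi
  config:
    table_manager:
      retention_deletes_enabled: true
      retention_period: 336h   # 14 days
  resources:
    requests:
      cpu: 250m
      memory: 512Mi
    limits:
      cpu: "1"
      memory: 1Gi
----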

. Create a ConfigMap for the Promtail sidecar. Select the configuration that matches your logging method:


.. Scrape container stdout
+
Use this configuration to collect logs from container stdout using Kubernetes service discovery:
+
[source,yaml]
----
apiVersion: v1
kind: ConfigMap
metadata:
  name: promtail-config
  namespace: sonataflow-observability
data:
  config.yml: |
    server:
      http_listen_port: 3101

    clients:
      - url: http://loki:3100/loki/api/v1/push

    scrape_configs:
      - job_name: sonataflow-workflows
        kubernetes_sd_configs:
          - role: pod
            namespaces:
              names: ["sonataflow-infra"]
        relabel_configs:
          - source_labels: [__meta_kubernetes_pod_label_sonataflow_org_workflow_app]
            action: keep
            regex: (.+)
          - source_labels: [__meta_kubernetes_pod_name]
            target_label: pod
          - source_labels: [__meta_kubernetes_pod_label_sonataflow_org_workflow_app]
            target_label: workflow
        pipeline_stages:
          - json:
              expressions:
                timestamp: timestamp
                level: level
                logger: logger
                message: message
                processInstanceId: mdc.processInstanceId
                traceId: mdc.traceId
                spanId: mdc.spanId
          - labels:
              level:
              logger:
              processInstanceId:
              traceId:
----

.. Scrape JSON log files
+
If you use file-based JSON logging, configure Promtail to read from the shared log volume:
+
[source,yaml]
----
apiVersion: v1
kind: ConfigMap
metadata:
  name: promtail-sidecar-config
  namespace: sonataflow-infra
data:
  config.yml: |
    server:
      http_listen_port: 3101

    clients:
      - url: http://loki.sonataflow-observability.svc.cluster.local:3100/loki/api/v1/push

    positions:
      filename: /var/log/positions.yaml

    scrape_configs:
      - job_name: sonataflow-json-files
        static_configs:
          - targets:
              - localhost
            labels:
              job: sonataflow-workflows
              __path__: /var/log/sonataflow/*.log
        pipeline_stages:
          - json:
              expressions:
                timestamp: timestamp
                level: level
                logger: loggerName
                message: message
                processInstanceId: mdc.processInstanceId
                traceId: mdc.traceId
                spanId: mdc.spanId
          - labels:
              level:
              logger:
              processInstanceId:
              traceId:
          - timestamp:
              source: timestamp
              format: RFC3339Nano
----
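+
The `json` stage resolves each dotted expression, such as `mdc.processInstanceId`, against the parsed log line. The following Python sketch mimics that lookup so you can sanity-check which labels a given line would yield. The sample line is illustrative only, and Promtail itself evaluates expressions with JMESPath rather than this simplified dotted lookup:
+
[source,python]
----
import json

# Sketch of how the `json` pipeline stage above maps dotted
# expressions to label values. Promtail actually uses JMESPath;
# this simplified lookup only handles plain dotted paths.
EXPRESSIONS = {
    "level": "level",
    "logger": "loggerName",
    "message": "message",
    "processInstanceId": "mdc.processInstanceId",
    "traceId": "mdc.traceId",
}

def extract(line):
    record = json.loads(line)
    labels = {}
    for label, path in EXPRESSIONS.items():
        value = record
        for key in path.split("."):
            if not isinstance(value, dict) or key not in value:
                value = None
                break
            value = value[key]
        if value is not None:
            labels[label] = value
    return labels

# Illustrative log line (shape only, not real SonataFlow output).
sample = ('{"timestamp": "2024-01-01T12:00:00.000Z", "level": "ERROR", '
          '"loggerName": "org.kie.kogito", "message": "Workflow failed", '
          '"mdc": {"processInstanceId": "abc-123", "traceId": "4bf92f35"}}')
print(extract(sample))
----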

. Add the Promtail sidecar container to your `SonataFlow` custom resource:
+
[source,yaml]
----
apiVersion: sonataflow.org/v1alpha08
kind: SonataFlow
metadata:
  name: my-workflow
  namespace: sonataflow-infra
spec:
  podTemplate:
    container:
      volumeMounts:
        - name: shared-logs
          mountPath: /var/log/sonataflow
    containers:
      - name: promtail-sidecar
        image: grafana/promtail:2.9.0
        args:
          - -config.file=/etc/promtail/config.yml
        volumeMounts:
          - name: shared-logs
            mountPath: /var/log/sonataflow
            readOnly: true
          - name: promtail-config
            mountPath: /etc/promtail
          - name: positions
            mountPath: /var/log
        resources:
          requests:
            cpu: 50m
            memory: 64Mi
          limits:
            cpu: 100m
            memory: 128Mi
    volumes:
      - name: shared-logs
        emptyDir:
          sizeLimit: 500Mi
      - name: promtail-config
        configMap:
          name: promtail-sidecar-config
      - name: positions
        emptyDir: {}
----
. Query logs in Grafana. After deploying the stack, use the following LogQL queries in the Grafana **Explore** view:
.. Filter logs by process instance
+
[source,logql]
----
{job="sonataflow-workflows"} | json | processInstanceId="abc-123-def-456"
----

.. Find workflow errors
+
[source,logql]
----
{job="sonataflow-workflows", workflow="onboarding"} | json | level="ERROR"
----

.. Trace correlation
+
[source,logql]
----
{job="sonataflow-workflows"} | json | traceId="4bf92f3577b34da6a3ce929d0e0e4736"
----

.. Process instance timeline
+
[source,logql]
----
{job="sonataflow-workflows"} | json | processInstanceId="abc-123-def-456" | line_format "{{.timestamp}} [{{.level}}] {{.message}}"
----

.Verification

* Access the Grafana **Explore** view.

* Run the following LogQL query, replacing `<instance_id>` with a valid ID:
+
[source,logql]
----
{job="sonataflow-workflows"} | json | processInstanceId="<instance_id>"
----
+
Expected result: Grafana displays the log entries associated with the specified process instance.
:_mod-docs-content-type: PROCEDURE

[id="configure-alerts-for-workflow-conditions_{context}"]
= Configure alerts for workflow conditions

[role="_abstract"]
Configure alerts to monitor SonataFlow workflows. These alerts notify you when workflows fail at high rates, when process instances are stuck, or when runtimes exceed expected thresholds.

.Prerequisites

* You have enabled structured JSON logging to provide metadata for LogQL and PromQL queries.

* You have installed a monitoring stack, such as Prometheus or Loki with Alertmanager in the cluster.

.Procedure

. Create a configuration file containing the following alert rule groups based on your monitoring requirements:

** To monitor failure rates:
+
[source,yaml]
----
- alert: WorkflowHighErrorRate
  expr: rate({job="sonataflow-workflows", level="ERROR"}[5m]) > 0.1
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: "High error rate in SonataFlow workflows"
----

** To identify stuck process instances:
+
[source,yaml]
----
- alert: WorkflowInstanceStuck
  expr: |
    time() - max by (process_instance_id) (
      max_over_time({job="sonataflow-workflows"} | json | unwrap timestamp [1h])
    ) > 3600
  labels:
    severity: critical
----

** To identify workflows running for longer durations:
+
[source,yaml]
----
- alert: LongRunningWorkflow
  expr: |
    time() - min by (process_instance_id) (
      min_over_time({job="sonataflow-workflows"} | json | message="Workflow started" | unwrap timestamp [24h])
    ) > 7200
  labels:
    severity: warning
  annotations:
    summary: "Workflow {{ $labels.process_instance_id }} running longer than 2 hours"
----

. Apply the alert rules to your cluster.
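+
For example, if your Loki ruler loads rules from a mounted ConfigMap (a common pattern, though the exact mount path and rule file layout depend on your ruler configuration), you can wrap the rules in a ConfigMap similar to the following and apply it with `oc apply -f`. The names shown here are illustrative assumptions:
+
[source,yaml]
----
apiVersion: v1
kind: ConfigMap
metadata:
  name: sonataflow-alert-rules          # hypothetical name
  namespace: sonataflow-observability
data:
  sonataflow-rules.yaml: |
    groups:
      - name: sonataflow-workflow-health
        rules:
          - alert: WorkflowHighErrorRate
            expr: rate({job="sonataflow-workflows", level="ERROR"}[5m]) > 0.1
            for: 2m
            labels:
              severity: warning
----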

.Verification

* Access your monitoring dashboard, such as the Prometheus UI or the OpenShift web console.

* Verify that the alerts appear in the list under the *Alerts* tab.