Portal Community

Workflow Execution Queries

# All logs for a specific workflow execution.
# Filter on the bare execution ID: the other processengine queries in this file
# parse lines as JSON, and a JSON-encoded field ("executionId":"exec-...") does
# not contain the logfmt-style text executionId=exec-... . The bare ID matches
# both encodings.
{job="processengine", environment="production"} |= "exec-d1e2f3a4"

# All error logs for a specific workflow execution (same ID filter, error stream only)
{job="processengine", environment="production", level="error"} |= "exec-d1e2f3a4"

# Every log line belonging to one workflow definition (matched on the parsed workflowId field)
{job="processengine"}
  | json
  | workflowId = "wf-8a4c2f91"

# Error-level lines containing "timeout" that carry a workflowId.
# The query itself has no time bound; run it with the time range set to the last hour.
{level="error", job="processengine"}
  |= "timeout"
  | json
  | workflowId != ""

# Everything logged for one tenant in production (stream labels only, no parsing)
{environment="production", job="processengine", tenant_id="tenant-abc-123"}

Node Execution Queries

# Error-level logs emitted by one node type
{level="error", job="processengine"}
  | json
  | nodeType = "DataFetchNode"

# Every log line for one node key, regardless of which workflow it ran in
{job="processengine"}
  | json
  | nodeKey = "approval-node-01"

# Node executions whose parsed duration_ms exceeds 5000
{job="processengine"}
  | json
  | duration_ms > 5000

# Per-nodeType rate of error lines over 5m windows (metric query, suitable for an alert rule)
sum by (nodeType) (
  rate({level="error", job="processengine"} | json | nodeType != "" [5m])
)

HIL (Human-in-the-Loop) Queries

# All HIL suspension events.
# Label-filter regexes (unlike line-filter regexes) are fully anchored, so the
# pattern must be wrapped in .* to match "suspended" anywhere in the message.
{job="processengine"} | json | message=~"(?i).*suspended.*"

# HIL tasks awaiting approval for more than 24 hours (log-based).
# Plain line filters: both substrings must appear in the raw log line.
{job="processengine"} |= "hil.suspend" |= "overdue"

# HIL approval events with outcome.
# .* wrapping is required for a substring match (anchored label-filter regex);
# the second filter keeps only lines that actually carry a hilOutcome field.
{job="processengine"} | json | message=~"(?i).*resumed.*" | hilOutcome != ""

# HIL timeout events (SLA breach)
{job="processengine", level="warn"} | json | hilOutcome="timeout"

Error Analysis Queries

# Production error lines whose raw text contains "Exception" (case-sensitive substring)
{level="error", environment="production", job="processengine"}
  |= "Exception"

# Per-tenant error-line rate over 5m windows (metric query)
sum by (tenant_id) (
  rate({level="error", job="processengine"}[5m])
)

# Top 10 error messages by count over 5m windows (metric query).
# `| __error__=""` drops lines that failed JSON parsing (e.g. stack-trace
# continuation lines); without it a single unparsable line makes the whole
# metric query fail with a pipeline error.
topk(10, sum(count_over_time({job="processengine", level="error"} | json | __error__="" [5m])) by (message))

# Search every production service's error stream for one message substring
{level="error", environment="production"}
  |= "Connection refused"

# Pull the processengine lines that mention a given trace ID (e.g. copied from an alert notification)
{job="processengine"}
  |= "4bf92f3577b34da6a3ce929d0e0e4736"

EdgeStream Queries

# All EdgeStream errors
{job="edgestream", level="error"}

# Message delivery failures for a specific topic.
# Label-filter regexes are fully anchored; without the .* wrapping the pattern
# would only match messages that are exactly "fail" or "error".
{job="edgestream"} | json | topic="workflow.events" | message=~"(?i).*(fail|error).*"

# EdgeStream throughput by topic (metric query).
# topic != "" also discards lines that could not be parsed as JSON.
sum(rate({job="edgestream"} | json | topic != "" [1m])) by (topic)

Octopus Agent Queries

# All Octopus LLM call logs.
# Label-filter regexes are fully anchored, so the alternation is grouped and
# wrapped in .* to match "llm" or "ai call" anywhere in the message.
{job="octopus"} | json | message=~"(?i).*(llm|ai call).*"

# Octopus errors with context (all JSON fields extracted as labels)
{job="octopus", level="error"} | json

# Memory access events (substring match on the message field, case-insensitive)
{job="octopus"} | json | message=~"(?i).*memory.*"

Log Volume Dashboard Queries

# Per-service (job) log line rate in production over 5m windows — capacity planning
sum by (job) (
  rate({environment="production"}[5m])
)

# Combined error-line rate for all production services
sum(
  rate({level="error", environment="production"}[5m])
)

# Line counts per 5m window split by level (feeds a stacked bar chart)
sum by (level) (
  count_over_time({environment="production"}[5m])
)