BizFirst Observe
LogQL for BizFirstGO
Ready-to-use LogQL queries for the most common BizFirstGO observability scenarios. Copy these into Grafana Explore or use them as starting points for dashboard panels and alert rules.
Workflow Execution Queries
# All logs for a specific workflow execution
{job="processengine", environment="production"} |= "executionId=exec-d1e2f3a4"
# All error logs for a specific workflow execution
{job="processengine", environment="production", level="error"} |= "executionId=exec-d1e2f3a4"
# All logs from a specific workflow definition
{job="processengine"} | json | workflowId="wf-8a4c2f91"
# Workflow executions that timed out (set the Grafana time range to "Last 1 hour")
{job="processengine", level="error"} |= "timeout" | json | workflowId != ""
# All logs for a specific tenant in production
{job="processengine", tenant_id="tenant-abc-123", environment="production"}
Node Execution Queries
# All error logs from a specific node type
{job="processengine", level="error"} | json | nodeType="DataFetchNode"
# All logs from a specific node key (in any workflow)
{job="processengine"} | json | nodeKey="approval-node-01"
# Slow node executions (duration_ms > 5000)
{job="processengine"} | json | duration_ms > 5000
# Node execution failures by node type (metric query — for alert)
sum(rate({job="processengine", level="error"} | json | nodeType != "" [5m])) by (nodeType)
HIL (Human-in-the-Loop) Queries
# All HIL suspension events (label-filter regexes are fully anchored, so wrap the term in .*)
{job="processengine"} | json | message=~"(?i).*suspended.*"
# HIL tasks awaiting approval for more than 24 hours (log-based)
{job="processengine"} |= "hil.suspend" |= "overdue"
# HIL approval events with outcome
{job="processengine"} | json | message=~"(?i).*resumed.*" | hilOutcome != ""
# HIL timeout events (SLA breach)
{job="processengine", level="warn"} | json | hilOutcome="timeout"
Error Analysis Queries
# All unhandled exceptions in production
{job="processengine", environment="production", level="error"} |= "Exception"
# Error rate by tenant (metric query)
sum(rate({job="processengine", level="error"}[5m])) by (tenant_id)
# Top 10 error messages by occurrence count (5-minute windows)
topk(10, sum(count_over_time({job="processengine", level="error"} | json [5m])) by (message))
# Find a specific error message across all services
{environment="production", level="error"} |= "Connection refused"
# Find logs related to a specific trace (from alert notification)
{job="processengine"} |= "4bf92f3577b34da6a3ce929d0e0e4736"
EdgeStream Queries
# All EdgeStream errors
{job="edgestream", level="error"}
# Message delivery failures for a specific topic
{job="edgestream"} | json | topic="workflow.events" | message=~"(?i).*(fail|error).*"
# EdgeStream throughput by topic (metric query)
sum(rate({job="edgestream"} | json | topic != "" [1m])) by (topic)
Octopus Agent Queries
# All Octopus LLM call logs
{job="octopus"} | json | message=~"(?i).*(llm|ai call).*"
# Octopus errors with context
{job="octopus", level="error"} | json
# Memory access events
{job="octopus"} | json | message=~"(?i).*memory.*"
Log Volume Dashboard Queries
# Total log ingestion rate by service, in log lines per second (for capacity planning; use bytes_rate for byte volume)
sum(rate({environment="production"}[5m])) by (job)
# Error log rate across all production services
sum(rate({environment="production", level="error"}[5m]))
# Log volume by level (stacked bar chart)
sum(count_over_time({environment="production"}[5m])) by (level)