BizFirst Observe
PromQL for BizFirstGO
Ready-to-use PromQL queries for BizFirstGO observability. Use these in Grafana dashboards, Prometheus alert rules, and ad-hoc Explore queries.
Workflow Health
# Throughput: workflow executions per second across all statuses (5m window)
sum(
  rate(bizfirst_workflow_executions_total[5m])
)
# Global failure ratio (0-1): per-second rate of status="failed" executions
# divided by the per-second rate of all executions
  sum(rate(bizfirst_workflow_executions_total{status="failed"}[5m]))
/
  sum(rate(bizfirst_workflow_executions_total[5m]))
# Failure ratio (0-1) broken out per tenant_id, for multi-tenant dashboards.
# Both sides are grouped by the same label so the division matches one-to-one.
  sum by (tenant_id) (rate(bizfirst_workflow_executions_total{status="failed"}[5m]))
/
  sum by (tenant_id) (rate(bizfirst_workflow_executions_total[5m]))
# Executions currently in progress (sum of the active-executions gauge)
sum(bizfirst_active_executions)
# Success percentage over the trailing 24h (SLA dashboard):
# status="success" executions as a share of all executions, scaled to 0-100
100 * (
  sum(rate(bizfirst_workflow_executions_total{status="success"}[24h]))
  /
  sum(rate(bizfirst_workflow_executions_total[24h]))
)
Node Performance
# 99th-percentile node execution duration (seconds) for each node_type.
# `le` must be kept in the grouping so histogram_quantile can see the buckets.
histogram_quantile(
  0.99,
  sum by (node_type, le) (
    rate(bizfirst_node_execution_duration_seconds_bucket[5m])
  )
)
# Mean node execution duration (seconds) for each node_type:
# rate of the histogram _sum divided by rate of the histogram _count
  sum by (node_type) (rate(bizfirst_node_execution_duration_seconds_sum[5m]))
/
  sum by (node_type) (rate(bizfirst_node_execution_duration_seconds_count[5m]))
# The five node types with the highest P99 execution duration
topk(
  5,
  histogram_quantile(
    0.99,
    sum by (node_type, le) (
      rate(bizfirst_node_execution_duration_seconds_bucket[5m])
    )
  )
)
# Error ratio (0-1) for each node_type: node errors per second
# divided by node executions per second
  sum by (node_type) (rate(bizfirst_node_errors_total[5m]))
/
  sum by (node_type) (rate(bizfirst_node_executions_total[5m]))
HIL Metrics
# Total HIL backlog across all tenants
sum(bizfirst_hil_pending_count)
# HIL backlog by tenant.
# Aggregated so each tenant yields exactly one series; a bare selector would
# return one series per full label set (e.g. per scrape target).
# NOTE(review): assumes the tenant label is `tenant_id`, as on the other
# per-tenant queries in this document - confirm against the exporter.
sum by (tenant_id) (bizfirst_hil_pending_count)
# Percentage (0-100) of pending HIL tasks that are overdue.
# clamp_min keeps the denominator at 1 or more, so an empty backlog no longer
# divides by zero and produces NaN; it has no effect once at least one task is pending.
(
  sum(bizfirst_hil_overdue_count)
  /
  clamp_min(sum(bizfirst_hil_pending_count), 1)
) * 100
# P95 HIL suspension duration (seconds), computed over a 1h window
histogram_quantile(
  0.95,
  sum by (le) (rate(bizfirst_hil_suspension_duration_seconds_bucket[1h]))
)
# HIL suspensions per second, broken down by `outcome`.
# This is a per-outcome throughput, not a ratio: to get an approval rate,
# divide the approved-outcome series by the sum over all outcomes.
sum by (outcome) (rate(bizfirst_hil_suspensions_total[5m]))
EdgeStream Throughput
# Messages per second for each topic (1m window)
sum by (topic) (rate(bizfirst_edgestream_messages_total[1m]))
# P99 delivery latency (seconds) for each topic.
# `le` stays in the grouping so histogram_quantile can read the buckets.
histogram_quantile(
  0.99,
  sum by (topic, le) (
    rate(bizfirst_edgestream_delivery_duration_seconds_bucket[5m])
  )
)
# Message delivery failure ratio (0-1) across all topics:
# status="failed" messages per second divided by all messages per second
  sum(rate(bizfirst_edgestream_messages_total{status="failed"}[5m]))
/
  sum(rate(bizfirst_edgestream_messages_total[5m]))
# Subscriber count per topic.
# Aggregated so each topic yields exactly one series; a bare selector would
# return one series per full label set (e.g. per scrape target).
# NOTE(review): assumes the gauge carries a `topic` label and that each
# instance reports only its own subscribers (so summing is correct) - if
# every instance reports the global count, use max instead of sum.
sum by (topic) (bizfirst_edgestream_subscriber_count)
Octopus AI Metrics
# LLM calls per second for each model (1m window)
sum by (model) (
  rate(bizfirst_octopus_llm_calls_total[1m])
)
# 99th-percentile LLM response duration (seconds) for each model.
# `le` stays in the grouping so histogram_quantile can read the buckets.
histogram_quantile(
  0.99,
  sum by (model, le) (
    rate(bizfirst_octopus_llm_duration_seconds_bucket[5m])
  )
)
# Tokens consumed per second, averaged over 1h, split by model and token
# `type` (for cost tracking)
sum by (model, type) (
  rate(bizfirst_octopus_tokens_total[1h])
)
# Currently active agent sessions for each tenant
sum by (tenant_id) (bizfirst_octopus_active_agents)