PromQL for BizFirst — Prometheus Metrics

Workflow Health

# Execution throughput: workflow executions per second, averaged over a 5-minute window
sum(
  rate(bizfirst_workflow_executions_total[5m])
)

# Failure ratio: executions with status="failed" divided by all executions (5-minute window)
sum(rate(bizfirst_workflow_executions_total{status="failed"}[5m]))
/
sum(rate(bizfirst_workflow_executions_total[5m]))

# Failure ratio broken out per tenant_id (for multi-tenant dashboards)
sum by (tenant_id) (rate(bizfirst_workflow_executions_total{status="failed"}[5m]))
/
sum by (tenant_id) (rate(bizfirst_workflow_executions_total[5m]))

# Executions currently in progress, summed over all series
sum(bizfirst_active_executions)

# Success percentage over a 24-hour window (SLA dashboard)
100 * (
  sum(rate(bizfirst_workflow_executions_total{status="success"}[24h]))
  /
  sum(rate(bizfirst_workflow_executions_total[24h]))
)

Node Performance

# 99th-percentile node execution latency, one series per node_type (5-minute window)
histogram_quantile(
  0.99,
  sum by (node_type, le) (rate(bizfirst_node_execution_duration_seconds_bucket[5m]))
)

# Mean node execution latency per node_type: rate of the histogram sum over rate of its count
sum by (node_type) (rate(bizfirst_node_execution_duration_seconds_sum[5m]))
/
sum by (node_type) (rate(bizfirst_node_execution_duration_seconds_count[5m]))

# The five node types with the highest P99 latency
topk(
  5,
  histogram_quantile(
    0.99,
    sum by (node_type, le) (rate(bizfirst_node_execution_duration_seconds_bucket[5m]))
  )
)

# Node errors as a fraction of node executions, per node_type
sum by (node_type) (rate(bizfirst_node_errors_total[5m]))
/
sum by (node_type) (rate(bizfirst_node_executions_total[5m]))

HIL Metrics

# Total HIL backlog across all tenants
sum(bizfirst_hil_pending_count)

# HIL backlog by tenant
# Aggregated so each tenant yields exactly one series, even if the gauge is
# exported by several instances or carries additional labels.
# NOTE(review): assumes the gauge carries a tenant_id label — confirm against the exporter.
sum by (tenant_id) (bizfirst_hil_pending_count)

# Percentage of pending HIL tasks that are overdue
# clamp_min keeps the denominator at 1 or above, so an empty backlog yields 0
# instead of NaN (0 / 0). Results are unchanged whenever at least one task is pending.
(sum(bizfirst_hil_overdue_count) / clamp_min(sum(bizfirst_hil_pending_count), 1)) * 100

# P95 HIL suspension duration (1-hour window)
histogram_quantile(0.95,
  sum(rate(bizfirst_hil_suspension_duration_seconds_bucket[1h])) by (le)
)

# HIL suspension rate by outcome (per second, one series per outcome label value)
# This is a throughput breakdown, not a ratio. For an approval ratio, divide the
# approved-outcome series by the total, e.g. (outcome label value to be confirmed):
#   sum(rate(bizfirst_hil_suspensions_total{outcome="approved"}[5m]))
#     / sum(rate(bizfirst_hil_suspensions_total[5m]))
sum(rate(bizfirst_hil_suspensions_total[5m])) by (outcome)

EdgeStream Throughput

# Messages per second by topic
# NOTE(review): rate() needs at least two samples inside the range, so [1m] only
# works with scrape intervals of roughly 30s or less. Widen to [5m] (as the other
# EdgeStream queries use) if this graph shows gaps.
sum(rate(bizfirst_edgestream_messages_total[1m])) by (topic)

# P99 delivery latency by topic
histogram_quantile(0.99,
  sum(rate(bizfirst_edgestream_delivery_duration_seconds_bucket[5m])) by (topic, le)
)

# Message delivery failure rate (failed messages as a fraction of all messages)
sum(rate(bizfirst_edgestream_messages_total{status="failed"}[5m]))
  /
sum(rate(bizfirst_edgestream_messages_total[5m]))

# Subscriber count per topic
# Aggregated so each topic yields exactly one series, even if the gauge is
# exported by several instances or carries additional labels.
# NOTE(review): assumes the gauge carries a topic label — confirm against the exporter.
sum by (topic) (bizfirst_edgestream_subscriber_count)

Octopus AI Metrics

# LLM calls per second, one series per model (1-minute window)
sum by (model) (rate(bizfirst_octopus_llm_calls_total[1m]))

# 99th-percentile LLM response latency per model (5-minute window)
histogram_quantile(
  0.99,
  sum by (model, le) (rate(bizfirst_octopus_llm_duration_seconds_bucket[5m]))
)

# Tokens consumed per second, split by model and type, averaged over 1 hour (cost tracking)
sum by (model, type) (rate(bizfirst_octopus_tokens_total[1h]))

# Active agent sessions per tenant_id
sum by (tenant_id) (bizfirst_octopus_active_agents)

← Previous: PromQL Basics | Next: Recording Rules →