Loki Alert Rules — Loki Log Aggregation

Ruler Configuration

# loki-config.yaml — enable the ruler
# The ruler evaluates LogQL alert rules and forwards firing alerts to
# Alertmanager.
ruler:
  # Alertmanager endpoint that receives alerts produced by the rules.
  alertmanager_url: 'http://alertmanager:9093'
  # Expose the ruler HTTP API.
  enable_api: true
  # Rule files are loaded from the local filesystem.
  # NOTE(review): with local storage Loki looks for rule files in one
  # sub-directory per tenant under this path — confirm the directory name
  # used for the rule files matches the tenant ID of this deployment.
  storage:
    type: local
    local:
      directory: /loki/rules
  # Working directory the ruler uses for temporary rule files.
  rule_path: /loki/rules-temp
  # Ruler ring state is kept in process memory (no external KV store).
  ring:
    kvstore:
      store: inmemory

BizFirst Alert Rules

# /loki/rules/bizfirst/alert-rules.yaml
# Log-based alert rules for the ProcessEngine job, evaluated by the Loki ruler.
# Every rule carries a `severity` and a `team` label; Alertmanager routes on
# both, so keep them present on any rule added here.
groups:
  - name: bizfirst-workflow-alerts
    # How often the rules in this group are evaluated.
    interval: 1m
    rules:

      # Alert: High error rate in production
      # Ratio of error-level lines to all lines (5m rate) above 5%,
      # sustained for 5 minutes.
      - alert: BizFirstHighErrorRate
        expr: |
          (
            sum(rate({job="processengine", environment="production", level="error"}[5m]))
            /
            sum(rate({job="processengine", environment="production"}[5m]))
          ) > 0.05
        for: 5m
        labels:
          severity: warning
          team: platform
        annotations:
          summary: "High error rate in ProcessEngine"
          # $value is a 0-1 ratio; humanizePercentage renders it as a
          # percentage (e.g. 0.07 -> "7%"), so no manual "%" suffix is needed.
          description: "Error rate is {{ $value | humanizePercentage }} (threshold: 5%)"

      # Alert: Error spike (absolute count)
      # More than 50 error-level lines per minute, sustained for 2 minutes.
      - alert: BizFirstErrorSpike
        expr: |
          sum(count_over_time({job="processengine", level="error"}[1m])) > 50
        for: 2m
        labels:
          severity: critical
          team: platform
        annotations:
          summary: "Error spike detected in ProcessEngine"

      # Alert: Node execution failures
      # Per-nodeType rate of error lines (nodeType extracted from the JSON
      # log body) above 0.1 lines/second, sustained for 3 minutes.
      - alert: BizFirstNodeFailures
        expr: |
          sum by (nodeType) (
            rate({job="processengine", level="error"} | json | nodeType != "" [5m])
          ) > 0.1
        for: 3m
        labels:
          severity: warning
          # Same owning team as the other ProcessEngine rules; without a team
          # label this alert would bypass the team routes in Alertmanager.
          team: platform
        annotations:
          summary: "High failure rate for node type {{ $labels.nodeType }}"

      # Alert: HIL timeout detected in logs
      # Any line containing "hil.timeout" in the last 5 minutes fires
      # immediately (for: 0m).
      - alert: BizFirstHILTimeout
        expr: |
          sum(count_over_time({job="processengine"} |= "hil.timeout" [5m])) > 0
        for: 0m
        labels:
          severity: warning
          team: operations
        annotations:
          summary: "HIL task timeout detected"
          description: "A workflow HIL suspension has timed out without human action"

      # Alert: Critical exception (immediate)
      # Any fatal-level line in the last minute fires immediately (for: 0m).
      - alert: BizFirstCriticalException
        expr: |
          sum(count_over_time({job="processengine", level="fatal"}[1m])) > 0
        for: 0m
        labels:
          severity: critical
          team: platform
        annotations:
          summary: "Fatal exception in ProcessEngine"

Alert Routing via Alertmanager

# alertmanager.yml — routing for BizFirst log-based alerts
# NOTE: Alertmanager does not expand ${VAR} placeholders on its own; render
# this file (e.g. with envsubst) before starting Alertmanager, or switch the
# secrets to the corresponding *_file options.
global:
  resolve_timeout: 5m
  # Required by the 'default' email receiver: without a smarthost and a from
  # address Alertmanager rejects the configuration. Placeholder values —
  # replace with your SMTP relay and sender address.
  smtp_smarthost: 'smtp.bizfirstai.com:587'
  smtp_from: 'alertmanager@bizfirstai.com'

route:
  group_by: ['alertname', 'team']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 12h
  # Fallback for alerts that match none of the child routes below.
  receiver: 'default'
  # Child routes are checked in order and the first match wins (no
  # `continue`), so critical alerts go to PagerDuty only, regardless of team.
  routes:
    - match:
        severity: critical
      receiver: 'pagerduty-critical'
    - match:
        team: operations
      receiver: 'slack-operations'
    - match:
        team: platform
      receiver: 'slack-platform'

# Every receiver named in a route above must be defined here; an undefined
# receiver makes Alertmanager fail to load the configuration.
receivers:
  - name: 'pagerduty-critical'
    pagerduty_configs:
      - service_key: "${PAGERDUTY_KEY}"

  - name: 'slack-platform'
    slack_configs:
      - api_url: "${SLACK_WEBHOOK_URL}"
        channel: '#platform-alerts'
        text: '{{ .GroupLabels.alertname }}: {{ .CommonAnnotations.summary }}'

  # Target of the `team: operations` route.
  # Placeholder channel name — adjust to the operations team's channel.
  - name: 'slack-operations'
    slack_configs:
      - api_url: "${SLACK_WEBHOOK_URL}"
        channel: '#operations-alerts'
        text: '{{ .GroupLabels.alertname }}: {{ .CommonAnnotations.summary }}'

  - name: 'default'
    email_configs:
      - to: 'platform@bizfirstai.com'

Prefer Prometheus Alerts for Metrics, Loki Alerts for Log Patterns

Use Loki alert rules for patterns that are only detectable in log content — specific error messages, security events, or business logic violations. Use Prometheus alert rules for metric thresholds — error rates, latency percentiles, queue depths. Both feed into the same Alertmanager instance, so routing and deduplication work identically for both sources.

← Log Retention in Loki