Portal Community

Retry Context

The engine may retry a node multiple times before declaring failure. OnErrorAsync fires on every attempt that throws — including intermediate attempts that will be retried. Use args.RetryCount to distinguish a transient failure (one that will still be retried) from the final failure after retries are exhausted.

/// <summary>
/// Reacts to a failed node attempt. An incident is created only when
/// <c>args.RetryCount</c> has reached the value returned by
/// <c>GetMaxRetries(args.NodeType)</c>; any earlier attempt is just logged at debug level.
/// </summary>
/// <param name="args">Event data for the failed attempt (node, execution, exception, retry count).</param>
/// <param name="ct">Cancellation token forwarded to the incident service.</param>
public async Task OnErrorAsync(NodeExecutionEventArgs args, CancellationToken ct)
{
    var retriesExhausted = args.RetryCount >= GetMaxRetries(args.NodeType);

    if (!retriesExhausted)
    {
        // Transient failure: keep it out of the incident system, debug log only.
        _logger.LogDebug("Node {NodeId} attempt {Retry} failed: {Error}",
            args.NodeId, args.RetryCount, args.Exception!.Message);
        return;
    }

    // Final failure: raise one high-severity incident carrying the full exception text.
    var incident = new Incident
    {
        Title       = $"Node {args.NodeType} permanently failed",
        ExecutionId = args.ExecutionId,
        NodeId      = args.NodeId,
        Error       = args.Exception!.ToString(),
        Severity    = IncidentSeverity.High
    };

    await _incidentService.CreateAsync(incident, ct);
}

Error Catalogue Pattern

/// <summary>
/// Records the failed attempt in the error catalogue, copying the identifying
/// fields from <paramref name="args"/> together with the exception's type name,
/// message and stack trace.
/// </summary>
/// <param name="args">Event data for the failed attempt.</param>
/// <param name="ct">Cancellation token forwarded to the catalogue.</param>
public async Task OnErrorAsync(NodeExecutionEventArgs args, CancellationToken ct)
{
    var failure = args.Exception!;

    var entry = new NodeError
    {
        ExecutionId   = args.ExecutionId,
        ProcessId     = args.ProcessId,
        TenantId      = args.TenantId,
        NodeId        = args.NodeId,
        NodeType      = args.NodeType,
        ExceptionType = failure.GetType().Name,
        Message       = failure.Message,
        StackTrace    = failure.StackTrace,
        RetryCount    = args.RetryCount,
        // Timestamp is taken when the handler runs, in UTC.
        OccurredAt    = DateTimeOffset.UtcNow
    };

    await _errorCatalogue.RecordAsync(entry, ct);
}

Exception Type Routing

Differentiate handling based on exception type:

/// <summary>
/// Routes a node failure by exception type: HTTP and timeout failures increment
/// dedicated counters, configuration failures page on-call, and anything else
/// is counted under "node.error.unknown" tagged with the exception type name.
/// </summary>
/// <param name="args">Event data for the failed attempt.</param>
/// <param name="ct">Cancellation token; not forwarded by any of the calls below.</param>
public async Task OnErrorAsync(NodeExecutionEventArgs args, CancellationToken ct)
{
    var failure = args.Exception;

    if (failure is HttpRequestException httpFailure)
    {
        // StatusCode may be absent on the exception; tag those as "unknown".
        var statusCode = httpFailure.StatusCode?.ToString() ?? "unknown";
        await _metrics.IncrementCounterAsync("node.error.http",
            ("status_code", statusCode));
    }
    else if (failure is TimeoutException)
    {
        await _metrics.IncrementCounterAsync("node.error.timeout",
            ("node_type", args.NodeType));
    }
    else if (failure is NodeConfigurationException configFailure)
    {
        // Configuration errors are bugs — always page on-call
        await _oncall.PageAsync($"Config error in {args.NodeType}: {configFailure.Message}");
    }
    else
    {
        await _metrics.IncrementCounterAsync("node.error.unknown",
            ("exception_type", failure!.GetType().Name));
    }
}

Availability Table

| Property | Available in OnError? |
| --- | --- |
| args.Exception | Yes — never null in OnError |
| args.RetryCount | Yes — 0 on first attempt |
| args.DurationMs | Yes — time elapsed before throw |
| args.Result | null — executor did not return |
| args.Context | Yes — context snapshot at time of throw |
Subscriber exceptions are swallowed: If your OnError implementation itself throws, that exception is caught by the dispatcher and logged, but does not affect the workflow. The original executor exception continues to propagate.