Pruning Strategies — Working Memory

Strategy 1: FIFO (First In, First Out)

The simplest strategy: the oldest messages are dropped, one turn at a time, until the history fits the token budget or the configured minimum history size is reached. Fast and deterministic.

/// <summary>
/// Prunes conversation history oldest-first (FIFO) until it fits a token budget.
/// </summary>
/// <remarks>
/// Relies on <c>_tokenCounter</c> and <c>_config</c>, which are not declared in this class;
/// presumably they are provided by <c>ContextPruner</c> — confirm against the base class.
/// </remarks>
public class FIFOPruner : ContextPruner
{
    /// <summary>
    /// Removes the oldest turns from a copy of <paramref name="history"/> until the remaining
    /// messages fit within <paramref name="targetTokens"/> or removing another turn would leave
    /// fewer than <c>_config.MinHistoryMessages</c> messages.
    /// </summary>
    /// <param name="history">Messages ordered oldest first. The input list is not modified.</param>
    /// <param name="targetTokens">Token budget the pruned history should fit into.</param>
    /// <param name="ct">Token observed between removals; when signalled a canceled task is returned.</param>
    /// <returns>
    /// A completed task holding the pruned copy. The result may still exceed the budget when the
    /// minimum-history floor prevents further removal.
    /// </returns>
    public override Task<IReadOnlyList<LLMMessage>> PruneAsync(
        IReadOnlyList<LLMMessage> history,
        int targetTokens,
        CancellationToken ct)
    {
        var messages = history.ToList();
        int minMessages = _config.MinHistoryMessages;

        // Cheap count check first so the token counter is not invoked once the floor is reached.
        while (messages.Count > minMessages
               && _tokenCounter.CountMessages(messages) > targetTokens)
        {
            if (ct.IsCancellationRequested)
            {
                return Task.FromCanceled<IReadOnlyList<LLMMessage>>(ct);
            }

            // A turn is the oldest message plus the assistant reply that directly follows it,
            // so the kept history never starts with a reply whose prompt was dropped.
            int turnSize = messages.Count > 1 && messages[1].Role == Role.Assistant ? 2 : 1;

            // Removing the whole turn must not cut below the configured floor; stop instead of
            // either splitting the turn or violating the minimum.
            if (messages.Count - turnSize < minMessages)
            {
                break;
            }

            messages.RemoveRange(0, turnSize);
        }

        return Task.FromResult<IReadOnlyList<LLMMessage>>(messages);
    }
}

// Best for: most agents — simple, no extra LLM calls
// Drawback: early context is permanently lost — no recovery

Strategy 2: Summarize

When the history is pruned, the oldest messages (roughly the oldest 30% in the example below) are condensed by an extra LLM call into a single compact summary message. That summary replaces the removed turns and is prepended to the remaining history, preserving key information.

/// <summary>
/// Prunes conversation history by replacing its oldest portion with an LLM-generated summary.
/// </summary>
/// <remarks>
/// Relies on <c>_summaryLLM</c> and <c>FormatMessages</c>, which are not declared in this
/// snippet; presumably they come from <c>ContextPruner</c> or elided members — confirm.
/// </remarks>
public class SummarizePruner : ContextPruner
{
    /// <summary>
    /// Summarizes roughly the oldest 30% of <paramref name="history"/> with one LLM call and
    /// returns the summary (as a system message) followed by the remaining messages.
    /// </summary>
    /// <param name="history">Messages ordered oldest first. The input list is not modified.</param>
    /// <param name="targetTokens">
    /// Not consulted by this strategy: each call removes a fixed fraction of the history.
    /// NOTE(review): the caller presumably invokes this only when over budget — confirm.
    /// </param>
    /// <param name="ct">Forwarded to the summary LLM call.</param>
    /// <returns>
    /// A new list: the summary message followed by the kept messages, or an unchanged copy of
    /// <paramref name="history"/> when it is too short for anything to be pruned.
    /// </returns>
    public override async Task<IReadOnlyList<LLMMessage>> PruneAsync(
        IReadOnlyList<LLMMessage> history,
        int targetTokens, CancellationToken ct)
    {
        const double pruneFraction = 0.3;       // share of the history (oldest first) to condense
        const int summaryMaxOutputTokens = 300; // output cap for the summary completion

        int pruneCount = (int)(history.Count * pruneFraction);

        // Short histories round down to zero pruned messages. Summarizing nothing would cost an
        // LLM call and make the history longer, so return an unchanged copy instead.
        if (pruneCount == 0)
        {
            return history.ToList();
        }

        // Keep turns intact: if the cut falls between a message and the assistant reply that
        // follows it, summarize the reply too so the kept history does not start with an
        // orphaned assistant message. pruneCount stays below history.Count here because a
        // non-zero pruneCount implies at least four messages.
        if (history[pruneCount].Role == Role.Assistant)
        {
            pruneCount++;
        }

        var toSummarize = history.Take(pruneCount).ToList();

        // Generate summary of the pruned messages
        var summaryPrompt = $"Summarize this conversation history concisely:\n{FormatMessages(toSummarize)}";
        var summary = await _summaryLLM.CompleteAsync(
            new[] { new LLMMessage(Role.User, summaryPrompt) },
            tools: null, new LLMOptions { MaxOutputTokens = summaryMaxOutputTokens }, ct);

        var summaryMessage = new LLMMessage(Role.System,
            $"[Earlier conversation summary]: {summary.Content}");

        // Summary first, then every message that was not summarized, in original order.
        var pruned = new List<LLMMessage>(history.Count - pruneCount + 1) { summaryMessage };
        pruned.AddRange(history.Skip(pruneCount));
        return pruned;
    }
}

// Best for: agents with tight budgets needing coherence over many turns
// Drawback: extra LLM call cost + latency per pruning event

Strategy 3: SlidingWindow

Always keeps only the last N user-assistant turn pairs; the token budget passed to the pruner is not consulted. Predictable and fast.

/// <summary>
/// Prunes conversation history down to a fixed number of most recent user-assistant turns.
/// </summary>
public class SlidingWindowPruner : ContextPruner
{
    /// <summary>
    /// Returns the messages of the last <c>_config.SlidingWindowTurns</c> pairs produced by
    /// <c>GroupIntoPairs</c>, flattened back into user-then-assistant order.
    /// </summary>
    /// <param name="history">Messages to group into pairs.</param>
    /// <param name="targetTokens">Not used by this strategy.</param>
    /// <param name="ct">Not used; the work completes synchronously.</param>
    /// <returns>A completed task holding the flattened window.</returns>
    public override Task<IReadOnlyList<LLMMessage>> PruneAsync(
        IReadOnlyList<LLMMessage> history,
        int targetTokens, CancellationToken ct)
    {
        int maxTurns = _config.SlidingWindowTurns;  // e.g., 10 last turns
        var turns = GroupIntoPairs(history);

        var window = new List<LLMMessage>();
        foreach (var turn in turns.TakeLast(maxTurns))
        {
            // NOTE(review): how GroupIntoPairs represents an incomplete pair (e.g. a trailing
            // user message with no reply yet) is not visible here — confirm neither side can
            // be null before relying on this output.
            window.Add(turn.User);
            window.Add(turn.Assistant);
        }

        return Task.FromResult<IReadOnlyList<LLMMessage>>(window);
    }
}

// Best for: short task agents where early context is irrelevant
// Drawback: no awareness of anything before the last N turns

Comparison Table

Strategy	Cost	Latency	Context Retention	Best For
FIFO	Zero	~0ms	Recent turns only	General purpose
Summarize	1 extra LLM call	+200–500ms	Summary of past + recent turns	Long conversations requiring continuity
SlidingWindow	Zero	~0ms	Last N turns only	Task-focused short sessions

← History Management Next: Knowledge Injection →