Chunking Strategies — RAG

Why Chunking Matters

A document cannot be embedded as a whole — embedding models have input limits, and a 100-page policy document would produce a single vector that averages all its topics into noise. Chunking creates granular, topic-focused segments that embed well and retrieve with precision.

Too small (e.g. 50 tokens): Chunks lose context; answers require too many retrievals to assemble
Too large (e.g. 2000 tokens): Chunks cover too many topics; low precision; token cost per retrieval is high
Optimal (256–512 tokens): Single coherent idea per chunk; precise retrieval with manageable token cost

IChunker Interface

/// <summary>
/// Contract implemented by every chunking strategy: turns a text into a list of chunks.
/// </summary>
public interface IChunker
{
    /// <summary>Splits <paramref name="text"/> into chunks according to <paramref name="config"/>.</summary>
    /// <param name="text">The text to split.</param>
    /// <param name="config">Chunking settings (strategy name, chunk size, overlap).</param>
    /// <returns>The chunks produced from the text, as a read-only list.</returns>
    IReadOnlyList<TextChunk> Chunk(string text, ChunkingConfig config);
}

/// <summary>
/// One chunk produced by an <see cref="IChunker"/>: the chunk text plus bookkeeping fields.
/// All properties are init-only, so they can only be set while the object is being constructed.
/// </summary>
public class TextChunk
{
    // Ordinal of the chunk within the chunker's output.
    public int    Index     { get; init; }   // Position in original document
    // Text of the chunk; defaults to the empty string rather than null.
    public string Content   { get; init; } = string.Empty;
    // NOTE(review): presumably the character offsets of this chunk in the source text.
    // Whether EndChar is inclusive or exclusive is not shown here — confirm against the code that sets them.
    public int    StartChar { get; init; }
    public int    EndChar   { get; init; }
    // Number of tokens in this chunk, as counted by the chunker that created it.
    public int    TokenCount{ get; init; }
}

/// <summary>
/// Settings passed to <see cref="IChunker.Chunk"/>: which strategy to use and how large
/// and how overlapping the chunks should be. Sizes are expressed in tokens.
/// </summary>
public class ChunkingConfig
{
    // Name of the chunking strategy; defaults to "FixedSize".
    public string Strategy  { get; set; } = "FixedSize";  // FixedSize | Sentence | Semantic
    // Upper bound on chunk length; defaults to 512.
    public int    ChunkSize { get; set; } = 512;           // Max tokens per chunk
    // Defaults to 64. Must stay below ChunkSize for fixed-size chunking to advance
    // (the window step there is ChunkSize - Overlap).
    public int    Overlap   { get; set; } = 64;            // Overlap tokens between chunks
}

Strategy 1: FixedSize

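Splits text into fixed-size token windows with overlap. Fast and predictable. Each window starts ChunkSize − Overlap tokens after the previous one, so consecutive chunks share Overlap tokens and context is not lost at chunk boundaries (Overlap must therefore be smaller than ChunkSize):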

/// <summary>
/// Splits text into fixed-size token windows. Each window starts
/// <c>ChunkSize - Overlap</c> tokens after the previous one, so consecutive
/// chunks share <c>Overlap</c> tokens.
/// </summary>
public class FixedSizeChunker : IChunker
{
    /// <summary>Tokenizes <paramref name="text"/> and emits overlapping windows of tokens.</summary>
    /// <param name="text">The text to split.</param>
    /// <param name="config">Supplies <c>ChunkSize</c> (window length) and <c>Overlap</c> (shared tokens).</param>
    /// <returns>
    /// The chunks in document order; empty when the text produces no tokens.
    /// Only <c>Index</c>, <c>Content</c> and <c>TokenCount</c> are populated.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="config"/> is null.</exception>
    /// <exception cref="ArgumentOutOfRangeException"><c>ChunkSize</c> is not positive or <c>Overlap</c> is negative.</exception>
    /// <exception cref="ArgumentException"><c>Overlap</c> is not smaller than <c>ChunkSize</c>.</exception>
    public IReadOnlyList<TextChunk> Chunk(string text, ChunkingConfig config)
    {
        if (config is null)
        {
            throw new ArgumentNullException(nameof(config));
        }
        if (config.ChunkSize <= 0)
        {
            throw new ArgumentOutOfRangeException(nameof(config), config.ChunkSize, "ChunkSize must be positive.");
        }
        if (config.Overlap < 0)
        {
            // A negative overlap would make the step larger than the window and silently drop tokens.
            throw new ArgumentOutOfRangeException(nameof(config), config.Overlap, "Overlap must not be negative.");
        }
        if (config.Overlap >= config.ChunkSize)
        {
            // A zero or negative step would never advance the window and the loop would not terminate.
            throw new ArgumentException("Overlap must be smaller than ChunkSize.", nameof(config));
        }

        // _tokenizer is a member declared outside this snippet.
        var tokens = _tokenizer.Tokenize(text);
        var chunks = new List<TextChunk>();
        int step   = config.ChunkSize - config.Overlap;   // guaranteed >= 1 by the checks above

        for (int start = 0; start < tokens.Count; start += step)
        {
            var window = tokens.Skip(start).Take(config.ChunkSize).ToList();
            chunks.Add(new TextChunk
            {
                Index      = chunks.Count,
                Content    = _tokenizer.Decode(window),
                TokenCount = window.Count
            });

            // This window already reached the end of the text; another step would
            // only emit a chunk made entirely of overlap.
            if (start + config.ChunkSize >= tokens.Count)
            {
                break;
            }
        }
        return chunks;
    }
}

// Best for: dense reference docs (policy PDFs, technical manuals)
// Drawback: may split mid-sentence

Strategy 2: Sentence

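Respects sentence boundaries — never splits mid-sentence. Sentences are accumulated into a chunk until adding the next one would exceed the token limit; the trailing sentences of a finished chunk are then carried into the next chunk as overlap. A single sentence longer than the limit is still kept whole, so it becomes its own oversized chunk: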

/// <summary>
/// Groups whole sentences into chunks of at most <c>ChunkSize</c> tokens,
/// carrying trailing sentences of each finished chunk into the next one as overlap.
/// </summary>
public class SentenceChunker : IChunker
{
    /// <summary>Splits <paramref name="text"/> into sentence-aligned chunks.</summary>
    /// <param name="text">The text to split.</param>
    /// <param name="config">
    /// Supplies <c>ChunkSize</c> (token limit per chunk) and <c>Overlap</c>
    /// (token budget for the sentences carried into the next chunk).
    /// </param>
    /// <returns>
    /// The chunks in document order. A chunk exceeds <c>ChunkSize</c> only when a
    /// single sentence is longer than the limit, because sentences are never split.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="config"/> is null.</exception>
    public IReadOnlyList<TextChunk> Chunk(string text, ChunkingConfig config)
    {
        if (config is null)
        {
            throw new ArgumentNullException(nameof(config));
        }

        // _sentenceDetector, _counter and BuildChunk are members declared outside this snippet.
        var sentences     = _sentenceDetector.Split(text);
        var chunks        = new List<TextChunk>();
        var current       = new List<string>();
        var currentCounts = new List<int>();   // token count of each sentence in `current`
        int currentTokens = 0;

        foreach (var sentence in sentences)
        {
            int sentTokens = _counter.Count(sentence);
            if (currentTokens + sentTokens > config.ChunkSize && current.Count > 0)
            {
                chunks.Add(BuildChunk(current, chunks.Count));

                // Walk backwards over the finished chunk and carry trailing sentences while they
                // (a) fit the Overlap token budget and (b) still leave room for the incoming sentence.
                // keepFrom never reaches 0, so the whole chunk is never repeated.
                int keepFrom = current.Count;
                int carried  = 0;
                while (keepFrom > 1
                       && carried + currentCounts[keepFrom - 1] <= config.Overlap
                       && carried + currentCounts[keepFrom - 1] + sentTokens <= config.ChunkSize)
                {
                    keepFrom--;
                    carried += currentCounts[keepFrom];
                }

                current       = current.GetRange(keepFrom, current.Count - keepFrom);
                currentCounts = currentCounts.GetRange(keepFrom, currentCounts.Count - keepFrom);
                currentTokens = carried;
            }

            current.Add(sentence);
            currentCounts.Add(sentTokens);
            currentTokens += sentTokens;
        }

        // Flush whatever is left after the last sentence.
        if (current.Count > 0)
        {
            chunks.Add(BuildChunk(current, chunks.Count));
        }
        return chunks;
    }
}

// Best for: narrative documents, articles, FAQs
// Drawback: chunks may vary significantly in size

Strategy 3: Semantic

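Groups adjacent sentences whose embeddings are similar. Each sentence is compared with the one before it using cosine similarity; when the similarity drops below a threshold (0.75 in the example below) — a large semantic shift — or the chunk reaches the token limit, a new chunk begins: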

/// <summary>
/// Groups adjacent sentences into chunks, starting a new chunk when the cosine similarity
/// between consecutive sentence embeddings drops below <see cref="SimilarityThreshold"/>
/// or when adding the next sentence would exceed <c>ChunkSize</c> tokens.
/// </summary>
public class SemanticChunker : IChunker
{
    /// <summary>
    /// Similarity below which two consecutive sentences are treated as a topic change.
    /// Defaults to 0.75.
    /// </summary>
    public float SimilarityThreshold { get; init; } = 0.75f;

    /// <summary>Splits <paramref name="text"/> into topic-aligned chunks.</summary>
    /// <param name="text">The text to split.</param>
    /// <param name="config">Supplies <c>ChunkSize</c>, the token limit per chunk.</param>
    /// <returns>The chunks in document order; empty when the text contains no sentences.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="config"/> is null.</exception>
    public IReadOnlyList<TextChunk> Chunk(string text, ChunkingConfig config)
    {
        if (config is null)
        {
            throw new ArgumentNullException(nameof(config));
        }

        // _detector, _embedder, _counter, CosineSimilarity and BuildChunk are declared outside this snippet.
        var chunks    = new List<TextChunk>();
        var sentences = _detector.Split(text);
        if (sentences.Count == 0)
        {
            // Nothing to embed or index; avoids reading sentences[0] on empty input.
            return chunks;
        }

        var embeddings = _embedder.EmbedBatch(sentences);  // Synchronous batch

        var current       = new List<string> { sentences[0] };
        var currentTokens = _counter.Count(sentences[0]);  // running total, so the chunk is not re-tokenized each step

        for (int i = 1; i < sentences.Count; i++)
        {
            var sentTokens = _counter.Count(sentences[i]);

            bool semanticBreak = CosineSimilarity(embeddings[i - 1], embeddings[i]) < SimilarityThreshold;
            // Check the size the chunk WOULD have, so the limit is enforced before it is exceeded.
            bool wouldOverflow = currentTokens + sentTokens > config.ChunkSize;

            if (semanticBreak || wouldOverflow)
            {
                chunks.Add(BuildChunk(current, chunks.Count));
                current       = new List<string> { sentences[i] };
                currentTokens = sentTokens;
            }
            else
            {
                current.Add(sentences[i]);
                currentTokens += sentTokens;
            }
        }

        // `current` always holds at least one sentence here.
        chunks.Add(BuildChunk(current, chunks.Count));
        return chunks;
    }
}

// Best for: diverse topic documents (annual reports, multi-section guides)
// Drawback: requires embedding at chunk time (slower ingestion)

Strategy Comparison

Strategy	Boundary Awareness	Ingestion Speed	Retrieval Precision	Best For
FixedSize	None (token boundary)	Fast	Good for dense text	Reference manuals, policy documents
Sentence	Sentence boundary	Fast	Better for natural language	FAQs, articles, conversational content
Semantic	Semantic topic boundary	Slow (needs embedding at index time)	Best — topic-aligned chunks	Mixed-topic documents, annual reports

← Document Ingestion Next: Embedding →