Portal Community

Why Chunking Matters

A document cannot be embedded as a whole — embedding models have input limits, and a 100-page policy document would produce a single vector that averages all its topics into noise. Chunking creates granular, topic-focused segments that embed well and retrieve with precision.

IChunker Interface

/// <summary>
/// Contract for a chunking strategy: turns a document's text into a list of
/// <see cref="TextChunk"/> instances.
/// </summary>
public interface IChunker
{
    /// <summary>
    /// Splits <paramref name="text"/> into chunks according to <paramref name="config"/>.
    /// </summary>
    /// <param name="text">The document text to split.</param>
    /// <param name="config">Chunking settings (strategy name, chunk size, overlap).</param>
    /// <returns>The chunks produced for <paramref name="text"/>, as a read-only list.</returns>
    IReadOnlyList<TextChunk> Chunk(string text, ChunkingConfig config);
}

/// <summary>
/// One segment of a source document, as produced by an <c>IChunker</c>.
/// All properties are init-only, so an instance is fixed once constructed.
/// </summary>
public class TextChunk
{
    /// <summary>Position in original document.</summary>
    public int Index { get; init; }

    /// <summary>Text of the chunk. Defaults to the empty string rather than null.</summary>
    public string Content { get; init; } = string.Empty;

    /// <summary>
    /// Start of the chunk.
    /// NOTE(review): presumably a character offset into the source text — confirm.
    /// </summary>
    public int StartChar { get; init; }

    /// <summary>
    /// End of the chunk.
    /// NOTE(review): inclusive vs. exclusive is not visible here — confirm.
    /// </summary>
    public int EndChar { get; init; }

    /// <summary>Number of tokens in <see cref="Content"/>.</summary>
    public int TokenCount { get; init; }
}

/// <summary>
/// Settings that control how a document is split into chunks.
/// </summary>
public class ChunkingConfig
{
    /// <summary>
    /// Name of the chunking strategy: <c>FixedSize</c>, <c>Sentence</c> or <c>Semantic</c>.
    /// Defaults to <c>FixedSize</c>.
    /// </summary>
    public string Strategy { get; set; } = "FixedSize";

    /// <summary>Max tokens per chunk. Defaults to 512.</summary>
    public int ChunkSize { get; set; } = 512;

    /// <summary>Overlap tokens between chunks. Defaults to 64.</summary>
    public int Overlap { get; set; } = 64;
}

Strategy 1: FixedSize

Splits text into fixed-size token windows with overlap. Fast and predictable. The overlap ensures context is not lost at chunk boundaries:

/// <summary>
/// Splits text into fixed-size token windows, each overlapping the previous one
/// by <c>config.Overlap</c> tokens.
/// </summary>
/// <remarks>
/// Relies on a <c>_tokenizer</c> member (providing <c>Tokenize</c> and <c>Decode</c>)
/// that is declared outside this snippet.
/// </remarks>
public class FixedSizeChunker : IChunker
{
    /// <summary>
    /// Tokenizes <paramref name="text"/> and emits one chunk per window of
    /// <c>config.ChunkSize</c> tokens, advancing by <c>ChunkSize - Overlap</c> tokens each time.
    /// </summary>
    /// <param name="text">The document text to split.</param>
    /// <param name="config">Supplies <c>ChunkSize</c> and <c>Overlap</c>.</param>
    /// <returns>
    /// The chunks in document order; empty when the text yields no tokens.
    /// Only <c>Index</c>, <c>Content</c> and <c>TokenCount</c> are populated;
    /// <c>StartChar</c>/<c>EndChar</c> keep their default values.
    /// </returns>
    /// <exception cref="System.ArgumentNullException">
    /// <paramref name="text"/> or <paramref name="config"/> is null.
    /// </exception>
    /// <exception cref="System.ArgumentException">
    /// <c>ChunkSize</c> is not positive, or <c>Overlap</c> is negative or not smaller than <c>ChunkSize</c>.
    /// </exception>
    public IReadOnlyList<TextChunk> Chunk(string text, ChunkingConfig config)
    {
        if (text is null)
        {
            throw new System.ArgumentNullException(nameof(text));
        }
        if (config is null)
        {
            throw new System.ArgumentNullException(nameof(config));
        }
        if (config.ChunkSize <= 0)
        {
            throw new System.ArgumentException("ChunkSize must be positive.", nameof(config));
        }
        // Overlap >= ChunkSize would make the step zero or negative, so the loop
        // below would never advance and would keep appending chunks forever.
        if (config.Overlap < 0 || config.Overlap >= config.ChunkSize)
        {
            throw new System.ArgumentException(
                "Overlap must be non-negative and smaller than ChunkSize.", nameof(config));
        }

        var tokens = _tokenizer.Tokenize(text);
        var chunks = new List<TextChunk>();
        int step   = config.ChunkSize - config.Overlap;   // guaranteed >= 1 by the checks above

        for (int i = 0; i < tokens.Count; i += step)
        {
            var slice = tokens.Skip(i).Take(config.ChunkSize).ToList();
            chunks.Add(new TextChunk
            {
                Index      = chunks.Count,
                Content    = _tokenizer.Decode(slice),
                TokenCount = slice.Count
            });

            // This window already reached the end of the token stream; a further
            // window would only repeat tokens that are part of this chunk.
            if (i + config.ChunkSize >= tokens.Count)
            {
                break;
            }
        }
        return chunks;
    }
}

// Best for: dense reference docs (policy PDFs, technical manuals)
// Drawback: may split mid-sentence

Strategy 2: Sentence

Respects sentence boundaries — never splits mid-sentence. Groups sentences into chunks until the token limit is reached, with sentence-level overlap:

/// <summary>
/// Groups whole sentences into chunks of at most <c>config.ChunkSize</c> tokens,
/// repeating up to two trailing sentences of each chunk at the start of the next.
/// </summary>
/// <remarks>
/// Relies on <c>_sentenceDetector</c>, <c>_counter</c> and <c>BuildChunk</c>, which are
/// declared outside this snippet. The overlap is sentence-based; <c>config.Overlap</c>
/// is not consulted here.
/// </remarks>
public class SentenceChunker : IChunker
{
    // Maximum number of trailing sentences carried into the next chunk.
    private const int OverlapSentenceCount = 2;

    /// <summary>
    /// Splits <paramref name="text"/> into sentences and packs them into chunks.
    /// </summary>
    /// <param name="text">The document text to split.</param>
    /// <param name="config">Supplies <c>ChunkSize</c>, the token budget per chunk.</param>
    /// <returns>
    /// The chunks in document order. A chunk exceeds <c>ChunkSize</c> only when a single
    /// sentence is by itself larger than the budget (sentences are never split).
    /// </returns>
    /// <exception cref="System.ArgumentNullException">
    /// <paramref name="text"/> or <paramref name="config"/> is null.
    /// </exception>
    public IReadOnlyList<TextChunk> Chunk(string text, ChunkingConfig config)
    {
        if (text is null)
        {
            throw new System.ArgumentNullException(nameof(text));
        }
        if (config is null)
        {
            throw new System.ArgumentNullException(nameof(config));
        }

        // Split into sentences using NLP sentence detector
        var sentences     = _sentenceDetector.Split(text);
        var chunks        = new List<TextChunk>();
        var current       = new List<string>();
        var currentCounts = new List<int>();   // token count of each sentence in `current`
        int currentTokens = 0;

        foreach (var sentence in sentences)
        {
            int sentTokens = _counter.Count(sentence);
            if (currentTokens + sentTokens > config.ChunkSize && current.Count > 0)
            {
                chunks.Add(BuildChunk(current, chunks.Count));

                // Overlap: start from the last N sentences of the chunk just emitted...
                int keep = System.Math.Min(OverlapSentenceCount, current.Count);
                int overlapTokens = 0;
                for (int k = currentCounts.Count - keep; k < currentCounts.Count; k++)
                {
                    overlapTokens += currentCounts[k];
                }

                // ...then drop the oldest carried sentence until overlap + the incoming
                // sentence fits the budget. Without this the next chunk could exceed
                // ChunkSize, or simply repeat the whole previous chunk plus one sentence.
                while (keep > 0 && overlapTokens + sentTokens > config.ChunkSize)
                {
                    overlapTokens -= currentCounts[currentCounts.Count - keep];
                    keep--;
                }

                // GetRange copies, so the list handed to BuildChunk is never mutated.
                current       = current.GetRange(current.Count - keep, keep);
                currentCounts = currentCounts.GetRange(currentCounts.Count - keep, keep);
                currentTokens = overlapTokens;
            }

            current.Add(sentence);
            currentCounts.Add(sentTokens);
            currentTokens += sentTokens;
        }

        if (current.Count > 0)
        {
            chunks.Add(BuildChunk(current, chunks.Count));
        }
        return chunks;
    }
}

// Best for: narrative documents, articles, FAQs
// Drawback: chunks may vary significantly in size

Strategy 3: Semantic

Groups consecutive sentences whose embeddings are similar. Each sentence is compared with the one before it using cosine similarity; when the similarity drops below a threshold (a large semantic shift), a new chunk begins:

/// <summary>
/// Groups consecutive sentences into chunks, starting a new chunk when the cosine
/// similarity between adjacent sentence embeddings falls below
/// <see cref="SimilarityThreshold"/> or when the token budget would be exceeded.
/// </summary>
/// <remarks>
/// Relies on <c>_detector</c>, <c>_embedder</c>, <c>_counter</c>, <c>CosineSimilarity</c>
/// and <c>BuildChunk</c>, which are declared outside this snippet.
/// </remarks>
public class SemanticChunker : IChunker
{
    /// <summary>
    /// Adjacent sentences whose cosine similarity is below this value are placed in
    /// different chunks. Defaults to 0.75.
    /// </summary>
    public float SimilarityThreshold { get; init; } = 0.75f;

    /// <summary>
    /// Splits <paramref name="text"/> into sentences, embeds them in one batch, and
    /// groups neighbouring sentences into chunks.
    /// </summary>
    /// <param name="text">The document text to split.</param>
    /// <param name="config">Supplies <c>ChunkSize</c>, the token budget per chunk.</param>
    /// <returns>
    /// The chunks in document order; empty when no sentences are detected.
    /// A chunk exceeds <c>ChunkSize</c> only when a single sentence is larger than the budget.
    /// </returns>
    /// <exception cref="System.ArgumentNullException">
    /// <paramref name="text"/> or <paramref name="config"/> is null.
    /// </exception>
    public IReadOnlyList<TextChunk> Chunk(string text, ChunkingConfig config)
    {
        if (text is null)
        {
            throw new System.ArgumentNullException(nameof(text));
        }
        if (config is null)
        {
            throw new System.ArgumentNullException(nameof(config));
        }

        var sentences = _detector.Split(text);
        var chunks    = new List<TextChunk>();

        // Nothing to embed or index into: avoids sentences[0] throwing on empty input.
        if (sentences.Count == 0)
        {
            return chunks;
        }

        var embeddings = _embedder.EmbedBatch(sentences);  // Synchronous batch

        var current = new List<string> { sentences[0] };
        // Running total of per-sentence token counts, so the chunk is not
        // re-tokenized from scratch on every iteration.
        int currentTokens = _counter.Count(sentences[0]);

        for (int i = 1; i < sentences.Count; i++)
        {
            float similarity = CosineSimilarity(embeddings[i - 1], embeddings[i]);
            int sentTokens   = _counter.Count(sentences[i]);

            bool semanticBreak = similarity < SimilarityThreshold;
            // Test the size the chunk WOULD have with this sentence added, so the
            // boundary is placed before the budget is exceeded rather than after.
            bool wouldOverflow = currentTokens + sentTokens > config.ChunkSize;

            if (semanticBreak || wouldOverflow)
            {
                chunks.Add(BuildChunk(current, chunks.Count));
                current       = new List<string> { sentences[i] };
                currentTokens = sentTokens;
            }
            else
            {
                current.Add(sentences[i]);
                currentTokens += sentTokens;
            }
        }

        // `current` always holds at least one sentence at this point.
        chunks.Add(BuildChunk(current, chunks.Count));
        return chunks;
    }
}

// Best for: diverse topic documents (annual reports, multi-section guides)
// Drawback: requires embedding at chunk time (slower ingestion)

Strategy Comparison

| Strategy | Boundary Awareness | Ingestion Speed | Retrieval Precision | Best For |
| --- | --- | --- | --- | --- |
| FixedSize | None (token boundary) | Fast | Good for dense text | Reference manuals, policy documents |
| Sentence | Sentence boundary | Fast | Better for natural language | FAQs, articles, conversational content |
| Semantic | Semantic topic boundary | Slow (needs embedding at index time) | Best — topic-aligned chunks | Mixed-topic documents, annual reports |