docuindex

package module
v0.0.14 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Jan 11, 2026 License: MIT Imports: 25 Imported by: 0

README

DocuIndex

A pure Go package for parsing PDF and DOCX files and extracting structured content optimized for AI search and RAG (Retrieval-Augmented Generation) applications.

**This package is under very active development. Expect frequent updates and improvements. Not yet stable.**

Features

  • Pure Go - No CGO or external dependencies
  • PDF Parsing - Complete PDF parser with PostScript content stream interpreter
  • DOCX Parsing - Full DOCX support via ZIP/XML parsing with style resolution
  • Custom Data Sources - Index arbitrary structured data with tag-based filtering
  • Text Extraction - Extract text with positioning, font info, and semantic structure
  • Image Extraction - Extract embedded images (JPEG, PNG, GIF, BMP, TIFF)
  • Semantic Analysis - Automatic heading detection, section tracking, keyword extraction
  • SQLite Storage - Unified SQLite database for all metadata and search indices
  • Hybrid Search - BM25 keyword search + vector semantic search with RRF fusion
  • Embedding Providers - Azure OpenAI, OpenAI, and Ollama support
  • Thread-Safe - Safe for concurrent use

Installation

go get github.com/mmalcek/docuIndex

Quick Start

package main

import (
    "fmt"
    "log"

    "github.com/mmalcek/docuIndex"
)

func main() {
    // Create a store (documents will be saved to ./data directory)
    store, err := docuindex.NewStore("./data")
    if err != nil {
        log.Fatal(err)
    }
    defer store.Close()

    // Index a PDF or DOCX file
    doc, err := store.IndexDocument("./document.pdf")   // PDF
    // doc, err := store.IndexDocument("./document.docx") // DOCX
    if err != nil {
        log.Fatal(err)
    }
    fmt.Printf("Indexed document: %s (%d pages)\n", doc.Info.Name, doc.Info.PageCount)

    // Search across all documents
    results, err := store.Search("search query")
    if err != nil {
        log.Fatal(err)
    }

    for _, r := range results.Results {
        fmt.Printf("Found in %s (page %d): %s\n", r.DocumentName, r.Page, r.Snippet)
    }
}

API Reference

Store Operations
Create a Store
// Basic store
store, err := docuindex.NewStore("./data")

// With options
store, err := docuindex.NewStore("./data",
    docuindex.WithImageExtraction(true),    // Extract images from PDFs
    docuindex.WithChecksum(true),           // Compute SHA-256 checksums
    docuindex.WithSemanticAnalysis(true),   // Enable heading/section detection
    docuindex.WithStemming(true),           // Enable Porter stemming for search
    docuindex.WithStopWords(true),          // Filter common stop words
)
Configure Embedding Provider (Optional)
import "github.com/mmalcek/docuIndex/embedding"

// Azure OpenAI with API key (defaults to API version 2024-10-21)
provider, err := embedding.NewProvider(embedding.Config{
    Provider:   "azure",
    Endpoint:   os.Getenv("AZURE_ENDPOINT"),
    APIKey:     os.Getenv("AZURE_API_KEY"),
    Model:      "text-embedding-3-small",
    // APIVersion: "v1",           // Optional: use new v1 API format
    // APIVersion: "2024-10-21",   // Optional: explicit version (default)
})

// Azure OpenAI with Azure Identity (Managed Identity, DefaultAzureCredential, etc.)
// Optional: requires github.com/Azure/azure-sdk-for-go/sdk/azidentity
import "github.com/Azure/azure-sdk-for-go/sdk/azidentity"

cred, err := azidentity.NewDefaultAzureCredential(nil)
provider, err := embedding.NewProvider(embedding.Config{
    Provider:        "azure",
    Endpoint:        os.Getenv("AZURE_ENDPOINT"),
    Model:           "text-embedding-3-small",
    TokenCredential: cred,  // Uses Bearer token instead of api-key header
})
// Tokens are cached and automatically refreshed before expiry

// OpenAI
provider, err := embedding.NewProvider(embedding.Config{
    Provider: "openai",
    APIKey:   os.Getenv("OPENAI_API_KEY"),
    Model:    "text-embedding-3-small",
})

// Ollama (local)
provider, err := embedding.NewProvider(embedding.Config{
    Provider: "ollama",
    Endpoint: "http://localhost:11434",
    Model:    "nomic-embed-text",
})

// Add to store for semantic search
store.SetEmbeddingProvider(provider)
Index Documents
// Index from file path (PDF or DOCX)
doc, err := store.IndexDocument("./document.pdf")
doc, err := store.IndexDocument("./document.docx")

// Index from io.Reader
file, _ := os.Open("document.pdf")
doc, err := store.IndexReader(file, "document.pdf")

file, _ := os.Open("document.docx")
doc, err := store.IndexReader(file, "document.docx")

// With custom name
doc, err := store.IndexDocument("./document.pdf",
    docuindex.WithName("My Custom Name"),
)

// With custom source and tags for filtering
doc, err := store.IndexDocument("./document.pdf",
    docuindex.WithIndexSource("knowledgebase"),
    docuindex.WithIndexTags(map[string]string{
        "department": "engineering",
        "project":    "alpha",
    }),
)
// Now searchable with:
// store.Search("query", docuindex.WithSources("knowledgebase"))
// store.Search("query", docuindex.WithTags(map[string]string{"department": "engineering"}))

// With deferred embedding (for bulk imports - embed later with EmbedPendingDocuments)
doc, err := store.IndexDocument("./document.pdf",
    docuindex.WithDeferEmbedding(true),
)
Index Custom Data
// Index structured data from any source (creates new document each time)
doc, err := store.IndexCustomData(&docuindex.CustomData{
    Source:      "crm-api",
    Name:        "Customer Notes Q4 2024",
    Description: "Extracted customer interaction notes",
    Tags: map[string]string{
        "quarter": "Q4-2024",
        "type":    "customer-notes",
    },
    ImportedAt: time.Now(), // Optional: track import time for incremental updates
    Entries: []docuindex.DataEntry{
        {Content: "Meeting with Acme Corp about renewal..."},
        {Content: "Support ticket #1234: User reported issue..."},
        {Content: "Sales call summary: Interested in enterprise plan..."},
    },
})

// Index custom data with images
doc, err := store.IndexCustomData(&docuindex.CustomData{
    Source: "products",
    Name:   "Product Catalog",
    Entries: []docuindex.DataEntry{
        {
            ID:      "laptop-001",
            Content: "MacBook Pro 16-inch with M3 chip, 36GB RAM",
            Images: []docuindex.CustomImage{
                {
                    Data:        jpegBytes,           // Image bytes (required)
                    Format:      "jpeg",              // "png", "jpeg", "gif", "bmp" (required)
                    Description: "Front view",       // AI-friendly alt text (optional)
                    OriginalName: "macbook.jpg",     // Display name (optional)
                    // Width/Height auto-detected if not provided
                },
            },
        },
    },
    // Document-level images (not tied to specific entry)
    Images: []docuindex.CustomImage{
        {Data: logoPng, Format: "png", Description: "Company logo"},
    },
})

// Upsert custom data (update existing document if source + external_id match)
// On update, existing images are automatically replaced
doc, err := store.UpsertCustomData(&docuindex.CustomData{
    Source:      "salesforce-api",
    Name:        "Salesforce Opportunities",
    ExternalID:  "opportunities-q4",  // Optional - enables update-or-create behavior
    ImportedAt:  time.Now(),
    Entries: []docuindex.DataEntry{
        {Content: "Acme Corp - $50k deal in progress..."},
        {Content: "Widget Inc - Renewal pending..."},
    },
})

// Get last import time for incremental updates
lastImport, err := store.GetLastImportTime("crm-api")
if !lastImport.IsZero() {
    // Fetch only new data from source since lastImport
}
Retrieve Documents
// Get by ID
doc, err := store.GetDocument("document-id")

// Find by external system ID (e.g., Salesforce record ID)
doc, err := store.FindByExternalID("salesforce-api", "opportunities-q4")
if doc != nil {
    fmt.Printf("Found: %s\n", doc.Info.Name)
}

// List all documents
docs, err := store.ListDocuments()
for _, info := range docs {
    fmt.Printf("%s: %s (%d pages)\n", info.ID, info.Name, info.PageCount)
}

// Delete a document
err := store.DeleteDocument("document-id")
Search Operations
results, err := store.Search("machine learning")

for _, r := range results.Results {
    fmt.Printf("Document: %s\n", r.DocumentName)
    fmt.Printf("Page: %d\n", r.Page)
    fmt.Printf("Section: %s\n", r.Section)
    fmt.Printf("Score: %.2f\n", r.Score)
    fmt.Printf("Snippet: %s\n", r.Snippet)
}

// Search with images included in results
results, err = store.Search("diagram",
    docuindex.WithImages(true),
)
for _, r := range results.Results {
    if len(r.Images) > 0 {
        fmt.Printf("Images in section: %v\n", r.Images)
        // e.g., ["images/uuid1.png", "images/uuid2.jpeg"]
    }
}
Search Modes
// Hybrid search (BM25 + vectors with RRF fusion) - default
// Falls back to keyword-only if no embedding provider is configured
results, err := store.Search("climate change impacts",
    docuindex.WithVectorWeight(0.6),   // Weight for semantic results
    docuindex.WithKeywordWeight(0.4),  // Weight for keyword results
)

// Keyword search (BM25 only)
results, err := store.Search("neural networks",
    docuindex.WithSearchMode(docuindex.SearchModeKeyword),
)

// Semantic search (vector embeddings only) - requires embedding provider
results, err := store.Search("how does machine learning work",
    docuindex.WithSearchMode(docuindex.SearchModeSemantic),
)
Search with Options
results, err := store.Search("neural networks",
    docuindex.WithMaxResults(10),           // Limit results
    docuindex.WithMinScore(0.5),            // Minimum relevance score
    docuindex.WithContextWindow(3),         // Include 3 blocks before/after
    docuindex.WithHighlight("<b>", "</b>"), // Highlight matches in snippet
)
Search Options Reference
Core Options
Option Type Default Description
WithMaxResults(n) int 100 Maximum number of results to return
WithMinScore(score) float64 0.0 Filter out results below this relevance threshold
WithContextWindow(blocks) int 2 Number of surrounding blocks to include for RAG context
WithHighlight(pre, post) string, string "", "" Markers to wrap matched terms in snippets
WithPageRange(start, end) int, int - Limit search to specific page range
WithDocuments(...ids) []string - Search only within specified document IDs
WithSections(...names) []string - Limit search to specific section names
WithImages(bool) bool false Include image paths in search results
Search Mode Options
Option Type Default Description
WithSearchMode(mode) SearchMode Hybrid Search strategy: SearchModeKeyword, SearchModeSemantic, or SearchModeHybrid
WithVectorWeight(weight) float64 0.5 Weight for semantic/vector search results (0.0-1.0)
WithKeywordWeight(weight) float64 0.5 Weight for BM25 keyword search results (0.0-1.0)
WithEfSearch(ef) int 50 HNSW search thoroughness (50=fast, 100=balanced, 200+=high recall)
Filtering Options
Option Type Default Description
WithSources(...sources) []string - Filter by source or format (e.g., "pdf", "crm", "docx")
WithTags(tags) map[string]string - Filter by tags with AND logic; use "!" prefix for negation
WithMetadata(bool) bool false Include tags, source, external_id in results
WithFilter(filter) *Filter - Advanced Filter DSL for complex queries
AI/Agent Options
Option Type Default Description
WithAgentOutput(bool) bool false Return structured AgentSearchResponse format
WithEstimateTokens(bool) bool false Include token count estimates in results
WithCitations(bool) bool false Add citation references [1], [2], etc.
WithChunking(opts) ChunkOptions - Configure result chunking for LLM context windows
Advanced Options
Option Type Default Description
WithDiversify(maxPerDoc) int 0 Limit results per document for variety (0 = unlimited)
WithDiagnostics(bool) bool false Include detailed search diagnostics in results
Understanding Vector and Keyword Weights

The WithVectorWeight and WithKeywordWeight options control how hybrid search combines semantic (vector) and keyword (BM25) results using Reciprocal Rank Fusion (RRF).

How weights work:

  • Both values should be between 0.0 and 1.0
  • They control the relative importance of each search signal
  • The weights don't need to sum to 1.0, but typically do for clarity

Practical examples:

// Balanced hybrid search (default behavior)
results, err := store.Search("machine learning concepts",
    docuindex.WithVectorWeight(0.5),
    docuindex.WithKeywordWeight(0.5),
)

// Emphasize semantic understanding - good for conceptual queries
// "How does photosynthesis work?" benefits from semantic search
results, err := store.Search("how does photosynthesis work",
    docuindex.WithVectorWeight(0.7),
    docuindex.WithKeywordWeight(0.3),
)

// Emphasize keyword matching - good for specific terms/names
// Searching for "RFC 7231" or "John Smith" benefits from keyword search
results, err := store.Search("RFC 7231",
    docuindex.WithVectorWeight(0.3),
    docuindex.WithKeywordWeight(0.7),
)

// Pure semantic search (equivalent to SearchModeSemantic)
results, err := store.Search("explain the concept",
    docuindex.WithVectorWeight(1.0),
    docuindex.WithKeywordWeight(0.0),
)

// Pure keyword search (equivalent to SearchModeKeyword)
results, err := store.Search("exact phrase",
    docuindex.WithVectorWeight(0.0),
    docuindex.WithKeywordWeight(1.0),
)

When to adjust weights:

Query Type Recommended Weights Reason
Conceptual questions Vector: 0.7, Keyword: 0.3 Semantic search understands meaning
Specific terms/IDs Vector: 0.3, Keyword: 0.7 Keywords need exact matching
Technical documentation Vector: 0.5, Keyword: 0.5 Balance both signals
Natural language Vector: 0.6, Keyword: 0.4 Slight semantic preference
Code/API references Vector: 0.2, Keyword: 0.8 Exact symbol matching important
HNSW Search Tuning with EfSearch

The WithEfSearch option controls how thoroughly the HNSW vector index is searched. Higher values improve recall (finding more relevant results) at the cost of latency.

// Fast search - good for real-time applications
results, err := store.Search("query",
    docuindex.WithEfSearch(50),
)

// Balanced - good for most use cases
results, err := store.Search("query",
    docuindex.WithEfSearch(100),
)

// High recall - good for batch processing or critical queries
results, err := store.Search("important query",
    docuindex.WithEfSearch(200),
)

EfSearch guidelines by dataset size:

Block Count Recommended EfSearch Notes
< 10,000 50 (default) Fast, high recall naturally
10,000 - 50,000 100 Good balance
50,000 - 100,000 100-150 May need tuning
> 100,000 200+ Higher values for critical searches

Note: WithEfSearch only affects the current query. To set a default for all searches, use WithHNSWConfig when creating the store:

store, _ := docuindex.NewStore("./data",
    docuindex.WithHNSWConfig(docuindex.HNSWConfig{
        EfSearch: 100,  // Default for all queries
    }),
)
Boolean and Phrase Queries
// Boolean operators: AND, OR, NOT (or +, -)
results, err := store.Search("machine learning AND neural")
results, err := store.Search("+required -excluded optional")

// Phrase matching with quotes
results, err := store.Search(`"exact phrase match"`)
Search in Specific Document
results, err := store.SearchInDocument("doc-id", "query")
Search with Source/Tag Filtering
// Search only custom data
results, err := store.Search("renewal",
    docuindex.WithSources("customdata"))

// Search by specific source
results, err := store.Search("renewal",
    docuindex.WithSources("crm-api"))

// Search with tag filter
results, err := store.Search("renewal",
    docuindex.WithTags(map[string]string{"quarter": "Q4-2024"}))

// Search with tag negation (! prefix excludes matching values)
results, err := store.Search("bugs",
    docuindex.WithTags(map[string]string{"status": "!Closed"}))  // Not Closed

// Combined filters
results, err := store.Search("enterprise",
    docuindex.WithSources("crm-api", "faq"),
    docuindex.WithTags(map[string]string{"type": "customer-notes"}))
Search with Metadata

Include document metadata (tags, source, external ID) in search results:

// Search with metadata included
results, err := store.Search("query", docuindex.WithMetadata(true))

for _, r := range results.Results {
    fmt.Printf("Document: %s\n", r.DocumentName)
    fmt.Printf("  Source: %s\n", r.Source)           // e.g., "crm-api"
    fmt.Printf("  ExternalID: %s\n", r.ExternalID)   // e.g., "salesforce-123"
    fmt.Printf("  Tags: %v\n", r.Tags)               // e.g., {"status": "Open", "priority": "High"}
}

// Combine with tag negation for filtering
results, err := store.Search("bugs",
    docuindex.WithTags(map[string]string{"status": "!Closed"}),
    docuindex.WithMetadata(true),  // Include metadata in results
)

Note: WithMetadata is disabled by default for performance. Enable it when you need to access tags, source, or external ID from search results.

Search with Diversification

When a single document contains many matching blocks, diversification limits results per document to improve variety:

// Limit to max 3 results per document
results, err := store.Search("machine learning",
    docuindex.WithDiversify(3),
)
Search Diagnostics

Enable diagnostics to understand search performance and behavior:

results, err := store.Search("query",
    docuindex.WithDiagnostics(true),
)

if results.Diagnostics != nil {
    fmt.Printf("Keyword results: %d\n", results.Diagnostics.KeywordResults)
    fmt.Printf("Vector results: %d\n", results.Diagnostics.VectorResults)
    fmt.Printf("Filtered by MinScore: %d\n", results.Diagnostics.FilteredByScore)
    fmt.Printf("Diversified from: %d results\n", results.Diagnostics.DiversifiedFrom)
    fmt.Printf("Timing - Keyword: %v, Vector: %v, Fusion: %v\n",
        results.Diagnostics.KeywordTime,
        results.Diagnostics.VectorTime,
        results.Diagnostics.FusionTime)
}
Get Context for RAG
// Get surrounding content blocks for a specific block
ctx, err := store.GetContext("doc-id", "blk_042", 5)

// ctx.Before - blocks before the target
// ctx.Center - the target block
// ctx.After - blocks after the target
Document Structure
Content Block
type ContentBlock struct {
    ID       string       // Unique block ID (e.g., "blk_001")
    Type     BlockType    // text, heading, image, list, table
    Content  string       // Text content or image path
    Page     int          // 1-indexed page number
    BBox     BoundingBox  // Position on page
    Font     *FontInfo    // Font metadata
    Semantic SemanticInfo // Heading level, section, keywords
}
Working with Blocks
doc, _ := store.GetDocument("doc-id")

// Get all text blocks
textBlocks := doc.GetTextBlocks()

// Get all image blocks
imageBlocks := doc.GetImageBlocks()

// Get blocks from a specific page
page3Blocks := doc.GetBlocksByPage(3)

// Find a specific block
block := doc.GetBlockByID("blk_042")

// Get images by document with optional filters
images, err := store.GetImagesByDocumentFiltered("doc-id", "", 0) // All images
images, err := store.GetImagesByDocumentFiltered("doc-id", "Introduction", 0) // By section
images, err := store.GetImagesByDocumentFiltered("doc-id", "", 3) // By page

// Get image info with AI-friendly description
info, err := store.GetImageInfo("image-uuid")
fmt.Printf("Image: %s - %s\n", info.OriginalName, info.Description)
// e.g., "macbook.jpg - Front view of MacBook Pro 16-inch"
Store Statistics
stats := store.Stats()
fmt.Printf("Documents: %d\n", stats.DocumentCount)
fmt.Printf("Total blocks: %d\n", stats.TotalBlocks)
fmt.Printf("Total images: %d\n", stats.TotalImages)
fmt.Printf("Index terms: %d\n", stats.IndexTerms)
fmt.Printf("Vectors: %d\n", stats.VectorCount)
Embedding Status

Check whether embeddings have been generated for a document:

// Quick check if any embeddings exist
hasEmb, err := store.HasEmbeddings(docID)
if hasEmb {
    fmt.Println("Document has embeddings")
}

// Get detailed embedding status
status, err := store.GetEmbeddingStatus(docID)
fmt.Printf("Progress: %.1f%% (%d/%d blocks)\n",
    status.Progress(), status.EmbeddedCount, status.TotalEmbeddable)

if status.IsComplete {
    fmt.Println("Fully embedded")
} else if status.HasEmbeddings {
    fmt.Println("Partially embedded")
} else {
    fmt.Println("No embeddings")
}

// EmbeddingStatus fields:
// - HasEmbeddings   bool      - true if any embeddings exist
// - IsComplete      bool      - true if all embeddable blocks have vectors
// - EmbeddedCount   int       - number of blocks with embeddings
// - TotalEmbeddable int       - number of blocks that can be embedded
// - Model           string    - embedding model used
// - Dimension       int       - vector dimension
// - LastUpdated     time.Time - when embeddings were last updated
Database Info

Get information about the database schema and library version (useful for debugging):

info, err := store.DatabaseInfo()
fmt.Printf("Schema version: %d\n", info.SchemaVersion)
fmt.Printf("Library version: %s\n", info.LibraryVersion)
fmt.Printf("Created: %s\n", info.CreatedAt)
fmt.Printf("Last migration: %s\n", info.LastMigration)

// DatabaseInfo fields:
// - SchemaVersion  int       - current database schema version
// - LibraryVersion string    - library version that created/migrated the DB
// - CreatedAt      time.Time - when database was first created
// - LastMigration  time.Time - when last schema migration was applied
Bulk Import (Large Datasets)

For importing more than 1000 records, use batch mode with deferred embedding:

// Configure store for bulk import
store, _ := docuindex.NewStore("./data",
    docuindex.WithHNSWConfig(docuindex.HNSWConfig{
        EfConst: 64, // Faster construction for bulk import
    }),
)

// Prepare data
var allData []*docuindex.CustomData
for _, record := range records {
    allData = append(allData, &docuindex.CustomData{
        Source:     "my-source",
        ExternalID: record.ID,
        Name:       record.Title,
        Entries:    []docuindex.DataEntry{{Content: record.Content}},
    })
}

// Batch index with deferred embedding
docs, err := store.IndexCustomDataBatch(allData,
    docuindex.WithDeferEmbedding(true),
)

// Generate embeddings after all documents indexed
err = store.EmbedPendingDocuments()
Incremental Sync

For ongoing synchronization from external sources:

// Get last sync timestamp
lastImport, _ := store.GetLastImportTime("my-source")

// Fetch only changed records
changedRecords := fetchRecordsModifiedSince(lastImport)

// Index in batches with deferred embedding
const batchSize = 100
for i := 0; i < len(changedRecords); i += batchSize {
    batch := changedRecords[i:min(i+batchSize, len(changedRecords))]

    for _, record := range batch {
        store.UpsertCustomData(&docuindex.CustomData{
            Source:     "my-source",
            ExternalID: record.ID,
            Name:       record.Title,
            ImportedAt: time.Now(),
            Entries:    []docuindex.DataEntry{{Content: record.Content}},
        },
            docuindex.WithDeferEmbedding(true),
        )
    }
}

// Process embeddings (resumable if interrupted)
err = store.EmbedPendingDocuments()
Resumable Embedding Maintenance

For background processing that survives restarts:

// Find documents without embeddings (scheduled task / startup)
pending, _ := store.GetDocumentsWithoutEmbeddings()
if len(pending) > 0 {
    log.Printf("Processing %d pending documents", len(pending))
    err := store.EmbedPendingDocuments()
}
Background Embedding

For non-blocking embedding that allows your application to remain responsive:

// Start background embedding (returns immediately)
err := store.EmbedPendingDocumentsAsync()
if err != nil {
    log.Fatal(err)
}

// Check progress periodically
for store.IsBackgroundRunning() {
    status := store.GetBackgroundStatus()
    fmt.Printf("Progress: %.1f%% (%d/%d documents)\n",
        status.Progress(),
        status.DocumentsDone,
        status.DocumentsTotal)
    time.Sleep(time.Second)
}

// Or block until completion
if err := store.WaitForBackground(); err != nil {
    log.Printf("Background embedding failed: %v", err)
}

// Cancel if needed (e.g., on application shutdown)
store.CancelBackground()

The BackgroundEmbeddingStatus provides detailed progress:

  • Running - whether embedding is in progress
  • DocumentsTotal / DocumentsDone - progress counters
  • CurrentDocID / CurrentDocName - currently processing document
  • ElapsedTime - time since start
  • Error - error if failed
  • Progress() - completion percentage (0-100)
HNSW Tuning by Use Case
Use Case EfConst EfSearch Notes
Small dataset (<10k) 200 50 Default - best quality
Bulk import 64 100 Faster construction
High-recall search 200 200 Slower but more accurate
Real-time indexing 100 50 Balanced
// Store-wide configuration (applied to all searches)
store, _ := docuindex.NewStore("./data",
    docuindex.WithHNSWConfig(docuindex.HNSWConfig{
        M:        16,   // Max connections (default)
        EfConst:  64,   // For bulk import
        EfSearch: 100,  // Good search quality
    }),
)

// Per-query override for recall vs latency tradeoff
results, _ := store.Search("important query",
    docuindex.WithEfSearch(200),  // Higher recall for this query
)
// Recommended: 50=fast, 100=balanced, 200+=high recall

See OPTIMISATIONS.md for detailed performance tuning guide.

Storage Architecture

DocuIndex uses a unified SQLite database for all metadata and search indices:

data/
├── docuindex.db           # SQLite database (all metadata)
├── hnsw.idx               # HNSW vector index (binary)
└── images/                # Extracted images with UUID names
    ├── a1b2c3d4-e5f6-7890-abcd-ef1234567890.png
    └── ...
Database Schema

The SQLite database contains:

  • documents - Document metadata (name, path, format, page count, timestamps)
  • content_blocks - Parsed content with position, font, and semantic info
  • search_terms - BM25 inverted index with term positions
  • document_stats - Statistics for BM25 ranking
  • vectors - Block embeddings as BLOBs
  • images - Image metadata with AI-friendly description (actual files in images/ folder)

Search Capabilities

  • Industry-standard relevance ranking
  • Boolean queries (AND, OR, NOT)
  • Phrase matching with position data
  • Porter stemming and stop word filtering
  • Heading boost (1.5x)
  • HNSW approximate nearest neighbor
  • Supports Azure OpenAI, OpenAI, Ollama
  • Block-level embeddings for granular retrieval
  • Cosine similarity distance
  • Combines BM25 + vector results
  • Reciprocal Rank Fusion (RRF) scoring
  • Configurable weights

Supported PDF Features

  • PDF 1.0 - 1.7
  • Traditional and cross-reference stream xref tables
  • Stream filters: FlateDecode, ASCIIHexDecode, ASCII85Decode, LZWDecode, RunLengthDecode
  • Font types: Type1, TrueType, Type0 (CID), Type3
  • Encoding support: WinAnsi, MacRoman, Standard, PDFDocEncoding
  • ToUnicode CMap for proper character mapping
  • Content stream operators for text positioning and graphics state
  • Embedded images (DCTDecode/JPEG, PNG)

Supported DOCX Features

  • Full ZIP archive parsing via standard library
  • XML content parsing with namespace handling
  • Style-based and font-based heading detection
  • Style inheritance chain resolution
  • Bullet and numbered list extraction
  • Table content with row/column structure
  • Inline and anchored image extraction (JPEG, PNG, GIF, BMP, TIFF)
  • Dublin Core metadata (title, author, keywords)
  • Application properties (page count, word count)
  • Field instructions (TOC, page numbers, hyperlinks)
  • Position estimation for search result context

Dependencies

  • modernc.org/sqlite - Pure Go SQLite (no CGO)
  • github.com/google/uuid - UUID generation
  • Standard library for everything else

Limitations

  • Encrypted PDFs are not supported
  • JBIG2Decode and CCITTFaxDecode PDF filters have limited support
  • DOCX position estimation is approximate (DOCX lacks exact positioning unlike PDF)
  • DOCX vector images (EMF, WMF) are detected but skipped

License

MIT License

Documentation

Index

Constants

View Source
const Version = "0.0.14"

Version is the current library version (semver)

Variables

View Source
var (
	// PDF parsing errors
	ErrInvalidPDF         = errors.New("invalid PDF file")
	ErrCorruptedPDF       = errors.New("corrupted PDF structure")
	ErrUnsupportedVersion = errors.New("unsupported PDF version")
	ErrEncryptedPDF       = errors.New("encrypted PDF not supported")

	// DOCX parsing errors
	ErrInvalidDOCX    = errors.New("invalid DOCX file")
	ErrCorruptedDOCX  = errors.New("corrupted DOCX structure")
	ErrMissingContent = errors.New("missing document.xml in DOCX")

	// Feature errors
	ErrUnsupportedFeature  = errors.New("unsupported PDF feature")
	ErrUnsupportedEncoding = errors.New("unsupported text encoding")
	ErrUnsupportedFilter   = errors.New("unsupported stream filter")
	ErrUnsupportedFont     = errors.New("unsupported font type")
	ErrUnsupportedImage    = errors.New("unsupported image format")

	// Storage errors
	ErrDocumentNotFound = errors.New("document not found")
	ErrDocumentExists   = errors.New("document already exists")
	ErrStorageCorrupted = errors.New("storage corrupted")
	ErrStorageFull      = errors.New("storage full")

	// Search errors
	ErrSearchFailed   = errors.New("search failed")
	ErrInvalidQuery   = errors.New("invalid search query")
	ErrIndexCorrupted = errors.New("search index corrupted")

	// General errors
	ErrInvalidInput = errors.New("invalid input")
	ErrIOError      = errors.New("I/O error")

	// CustomData errors
	ErrInvalidCustomData = errors.New("invalid custom data")
	ErrMissingSource     = errors.New("custom data source is required")
	ErrMissingEntries    = errors.New("custom data must have at least one entry")
)

Sentinel errors for common cases

Functions

func ChunkBlocks

func ChunkBlocks(blocks []ContentBlock, opts ChunkOptions) [][]ContentBlock

ChunkBlocks regroups content blocks based on token limits

func ChunkSearchResults

func ChunkSearchResults(results []SearchResult, maxTokens int) [][]SearchResult

ChunkSearchResults chunks search results to fit within a token budget

func CombineChunkedBlocks

func CombineChunkedBlocks(blocks []ContentBlock, separator string) string

CombineChunkedBlocks combines a group of blocks into a single string

func EstimateAgentResultTokens

func EstimateAgentResultTokens(results []AgentSearchResult) int

EstimateAgentResultTokens estimates total tokens for agent search results

func EstimateBlockTokens

func EstimateBlockTokens(block *ContentBlock) int

EstimateBlockTokens estimates tokens for a content block

func EstimateContextTokens

func EstimateContextTokens(ctx *ContextResult) int

EstimateContextTokens estimates tokens for a context window

func EstimateResultTokens

func EstimateResultTokens(results []SearchResult) int

EstimateResultTokens estimates total tokens for search results

func EstimateTokens

func EstimateTokens(text string) int

EstimateTokens estimates the token count for a given text string. This uses an approximation based on cl100k_base tokenizer behavior. For English text, it averages ~4 characters per token.

func FitsInContext

func FitsInContext(text string, maxTokens int) bool

FitsInContext checks if content fits within a token budget

func IsCustomDataError

func IsCustomDataError(err error) bool

IsCustomDataError checks if an error is a custom data error

func IsDOCXError

func IsDOCXError(err error) bool

IsDOCXError checks if an error is a DOCX error

func IsObjectError

func IsObjectError(err error) bool

IsObjectError checks if an error is a PDF object error

func IsParseError

func IsParseError(err error) bool

IsParseError checks if an error is a PDF parsing error

func IsSearchError

func IsSearchError(err error) bool

IsSearchError checks if an error is a search error

func IsStorageError

func IsStorageError(err error) bool

IsStorageError checks if an error is a storage error

func QueryTypeDescription

func QueryTypeDescription(qt QueryType) string

QueryTypeDescription returns a human-readable description of the query type

func TruncateToTokenLimit

func TruncateToTokenLimit(text string, maxTokens int) string

TruncateToTokenLimit truncates text to approximately fit within a token limit

Types

type AgentSearchResponse

type AgentSearchResponse struct {
	Query           string              `json:"query"`
	QueryType       QueryType           `json:"query_type"`
	Results         []AgentSearchResult `json:"results"`
	TotalHits       int                 `json:"total_hits"`
	SearchTime      time.Duration       `json:"search_time"`
	EstimatedTokens int                 `json:"estimated_tokens"`
	Metadata        map[string]any      `json:"metadata,omitempty"`
}

AgentSearchResponse provides AI agent-friendly search results

type AgentSearchResult

type AgentSearchResult struct {
	DocumentID   string         `json:"document_id"`
	DocumentName string         `json:"document_name"`
	BlockID      string         `json:"block_id"`
	Content      string         `json:"content"`
	Snippet      string         `json:"snippet"`
	Score        float64        `json:"score"`
	Page         int            `json:"page"`
	Section      string         `json:"section"`
	CitationRef  string         `json:"citation_ref"` // e.g., "[1]", "[2]"
	TokenCount   int            `json:"token_count"`
	Context      []ContentBlock `json:"context,omitempty"`
	Images       []string       `json:"images,omitempty"`
}

AgentSearchResult is a single result optimized for AI agent consumption

type BackgroundEmbeddingStatus added in v0.0.9

type BackgroundEmbeddingStatus struct {
	Running        bool          `json:"running"`         // Is background build in progress
	StartedAt      time.Time     `json:"started_at"`      // When build started
	DocumentsTotal int           `json:"documents_total"` // Total documents to process
	DocumentsDone  int           `json:"documents_done"`  // Documents processed so far
	CurrentDocID   string        `json:"current_doc_id"`  // Currently processing document
	CurrentDocName string        `json:"current_doc_name"`
	ElapsedTime    time.Duration `json:"elapsed_time"`
	Error          error         `json:"error,omitempty"` // Error if failed
}

BackgroundEmbeddingStatus represents the status of background HNSW building

func (BackgroundEmbeddingStatus) Progress added in v0.0.9

func (s BackgroundEmbeddingStatus) Progress() float64

Progress returns the completion percentage (0-100)

type BlockType

type BlockType string

BlockType represents the type of content block

const (
	BlockTypeText    BlockType = "text"
	BlockTypeHeading BlockType = "heading"
	BlockTypeImage   BlockType = "image"
	BlockTypeList    BlockType = "list"
	BlockTypeTable   BlockType = "table"
	BlockTypeCustom  BlockType = "custom" // Custom data entry
)

type BoundingBox

type BoundingBox struct {
	X          float64 `json:"x"`           // Left edge in points
	Y          float64 `json:"y"`           // Bottom edge in points (PDF coordinate system)
	Width      float64 `json:"width"`       // Width in points
	Height     float64 `json:"height"`      // Height in points
	PageWidth  float64 `json:"page_width"`  // Page width for relative calculations
	PageHeight float64 `json:"page_height"` // Page height for relative calculations
}

BoundingBox represents the position and size of content on a page

func (BoundingBox) RelativePosition

func (b BoundingBox) RelativePosition() (xPct, yPct, wPct, hPct float64)

RelativePosition returns position as percentages of page dimensions

type Chunk

type Chunk struct {
	Content    string `json:"content"`
	StartIdx   int    `json:"start_idx"`
	EndIdx     int    `json:"end_idx"`
	TokenCount int    `json:"token_count"`
}

Chunk represents a portion of content with token information

func ChunkContent

func ChunkContent(content string, opts ChunkOptions) []Chunk

ChunkContent splits content into LLM-friendly chunks based on the provided options

type ChunkOptions

type ChunkOptions struct {
	MaxTokens     int    `json:"max_tokens"`     // Maximum tokens per chunk (e.g., 512, 1024)
	OverlapTokens int    `json:"overlap_tokens"` // Token overlap between chunks
	ChunkBy       string `json:"chunk_by"`       // "paragraph", "sentence", "tokens"
}

ChunkOptions configures how content is chunked for LLM context windows

func DefaultChunkOptions

func DefaultChunkOptions() ChunkOptions

DefaultChunkOptions returns sensible defaults for chunking

type ContentBlock

type ContentBlock struct {
	ID       string       `json:"id"`                 // Unique block ID (e.g., "blk_001")
	Type     BlockType    `json:"type"`               // text, heading, image, etc.
	Content  string       `json:"content"`            // Text content or image path
	Page     int          `json:"page"`               // 1-indexed page number
	BBox     BoundingBox  `json:"bbox"`               // Position on page
	Font     *FontInfo    `json:"font,omitempty"`     // Font info for text
	Semantic SemanticInfo `json:"semantic,omitempty"` // AI-friendly metadata
	Children []string     `json:"children,omitempty"` // Child block IDs for hierarchy
}

ContentBlock represents a unit of content with position and metadata

type ContextResult

type ContextResult struct {
	DocumentID string         `json:"document_id"`
	CenterID   string         `json:"center_id"` // The block we're getting context for
	Before     []ContentBlock `json:"before"`    // Blocks before
	Center     ContentBlock   `json:"center"`    // The center block
	After      []ContentBlock `json:"after"`     // Blocks after
}

ContextResult contains content blocks around a specific block

type CustomData

type CustomData struct {
	Source      string            `json:"source"` // Source identifier (e.g., "crm", "faq")
	Name        string            `json:"name"`   // Display name
	Description string            `json:"description,omitempty"`
	Tags        map[string]string `json:"tags,omitempty"`        // Filter-only tags (not searched)
	Entries     []DataEntry       `json:"entries"`               // Data entries to index
	ImportedAt  time.Time         `json:"imported_at,omitempty"` // When data was imported (for incremental updates)
	ExternalID  string            `json:"external_id,omitempty"` // Unique ID from source system (for upsert)
	Images      []CustomImage     `json:"images,omitempty"`      // Document-level images (not tied to specific entry)
}

CustomData represents structured data to be indexed

type CustomDataError

type CustomDataError struct {
	Source  string // Source identifier
	Message string // What went wrong
	Err     error  // Underlying error
}

CustomDataError indicates a custom data processing error

func NewCustomDataError

func NewCustomDataError(source, message string, err error) *CustomDataError

NewCustomDataError creates a new CustomDataError

func (*CustomDataError) Error

func (e *CustomDataError) Error() string

func (*CustomDataError) Unwrap

func (e *CustomDataError) Unwrap() error

type CustomImage added in v0.0.5

type CustomImage struct {
	Data         []byte `json:"-"`                       // Image bytes (required, excluded from JSON)
	Format       string `json:"format"`                  // "png", "jpeg", "gif", "bmp" (required)
	Width        int    `json:"width,omitempty"`         // Optional, auto-detected if not provided
	Height       int    `json:"height,omitempty"`        // Optional, auto-detected if not provided
	OriginalName string `json:"original_name,omitempty"` // Optional display name
	Description  string `json:"description,omitempty"`   // AI-friendly alt text/description
}

CustomImage represents an image to be indexed with custom data

type DOCXError

type DOCXError struct {
	Part    string // Which part of the DOCX (e.g., "word/document.xml")
	Message string // What went wrong
	Err     error  // Underlying error
}

DOCXError indicates a DOCX parsing or processing error

func NewDOCXError

func NewDOCXError(part, message string, err error) *DOCXError

NewDOCXError creates a new DOCXError

func (*DOCXError) Error

func (e *DOCXError) Error() string

func (*DOCXError) Unwrap

func (e *DOCXError) Unwrap() error

type DataEntry

type DataEntry struct {
	ID       string            `json:"id,omitempty"`       // Optional, auto-generated if empty
	Content  string            `json:"content"`            // Text content to index/embed
	Type     string            `json:"type,omitempty"`     // "text" (default), "json", "code"
	Metadata map[string]string `json:"metadata,omitempty"` // Entry-specific metadata
	Images   []CustomImage     `json:"images,omitempty"`   // Images associated with this entry
}

DataEntry represents a single entry in custom data

type DatabaseInfo added in v0.0.5

type DatabaseInfo struct {
	SchemaVersion  int       `json:"schema_version"`  // Current schema version
	LibraryVersion string    `json:"library_version"` // Library version that created/migrated DB
	CreatedAt      time.Time `json:"created_at"`      // When database was created
	LastMigration  time.Time `json:"last_migration"`  // When last migration was applied
}

DatabaseInfo contains information about the database schema and version

type DateRange

type DateRange struct {
	Start time.Time `json:"start"`
	End   time.Time `json:"end"`
}

DateRange represents a time range for filtering

type DedupResult

type DedupResult struct {
	IsDuplicate  bool    `json:"is_duplicate"`
	ExistingID   string  `json:"existing_id,omitempty"`
	ExistingName string  `json:"existing_name,omitempty"`
	Similarity   float64 `json:"similarity"`
	Method       string  `json:"method"` // "checksum", "content_hash", "embedding"
}

DedupResult contains information about duplicate detection

type Document

type Document struct {
	Info    DocumentInfo    `json:"info"`
	Content DocumentContent `json:"content"`
}

Document represents a fully indexed document

func (*Document) GetBlockByID

func (d *Document) GetBlockByID(id string) *ContentBlock

GetBlockByID finds a block by its ID

func (*Document) GetBlocksByPage

func (d *Document) GetBlocksByPage(page int) []ContentBlock

GetBlocksByPage returns blocks for a specific page

func (*Document) GetImageBlocks

func (d *Document) GetImageBlocks() []ContentBlock

GetImageBlocks returns only image-type blocks

func (*Document) GetTextBlocks

func (d *Document) GetTextBlocks() []ContentBlock

GetTextBlocks returns only text-type blocks

type DocumentContent

type DocumentContent struct {
	Version string         `json:"version"` // Schema version
	Blocks  []ContentBlock `json:"blocks"`  // All content blocks
}

DocumentContent holds the structured content of a document

type DocumentFormat

type DocumentFormat string

DocumentFormat represents the source document format

const (
	FormatPDF        DocumentFormat = "pdf"
	FormatDOCX       DocumentFormat = "docx"
	FormatCustomData DocumentFormat = "customdata" // Custom data source
)

type DocumentInfo

type DocumentInfo struct {
	ID           string         `json:"id"`                    // UUID
	Name         string         `json:"name"`                  // Original filename
	OriginalPath string         `json:"original_path"`         // Path when indexed
	SizeBytes    int64          `json:"size_bytes"`            // File size
	PageCount    int            `json:"page_count"`            // Number of pages
	Format       DocumentFormat `json:"format"`                // pdf, docx, customdata
	Checksum     string         `json:"checksum"`              // SHA-256 hash
	CreatedAt    time.Time      `json:"created_at"`            // When indexed
	UpdatedAt    time.Time      `json:"updated_at"`            // Last update
	Source       string         `json:"source,omitempty"`      // CustomData source identifier
	Description  string         `json:"description,omitempty"` // CustomData description
	ImportedAt   time.Time      `json:"imported_at,omitempty"` // CustomData import timestamp
	ExternalID   string         `json:"external_id,omitempty"` // External identifier for upsert
}

DocumentInfo contains metadata about an indexed document

type EmbeddingStatus

type EmbeddingStatus struct {
	HasEmbeddings   bool      `json:"has_embeddings"`         // True if any embeddings exist
	IsComplete      bool      `json:"is_complete"`            // True if all embeddable blocks have vectors
	EmbeddedCount   int       `json:"embedded_count"`         // Number of blocks with embeddings
	TotalEmbeddable int       `json:"total_embeddable"`       // Number of blocks that can be embedded
	Model           string    `json:"model,omitempty"`        // Embedding model used
	Dimension       int       `json:"dimension,omitempty"`    // Vector dimension
	LastUpdated     time.Time `json:"last_updated,omitempty"` // When embeddings were last updated
}

EmbeddingStatus contains information about a document's embedding state

func (*EmbeddingStatus) Progress

func (e *EmbeddingStatus) Progress() float64

Progress returns embedding completion as a percentage (0-100)

type Filter

type Filter struct {
	// contains filtered or unexported fields
}

Filter provides a fluent API for building search filters

func NewFilter

func NewFilter() *Filter

NewFilter creates a new empty filter

func (*Filter) After

func (f *Filter) After(t time.Time) *Filter

After filters documents created/imported after the given time

func (*Filter) Before

func (f *Filter) Before(t time.Time) *Filter

Before filters documents created/imported before the given time

func (*Filter) Build

func (f *Filter) Build() *FilterConfig

Build converts the Filter to FilterConfig for internal use

func (*Filter) DateRange

func (f *Filter) DateRange(start, end time.Time) *Filter

DateRange filters documents created/imported within a time range

func (*Filter) ExternalIDs

func (f *Filter) ExternalIDs(ids ...string) *Filter

ExternalIDs filters by external identifiers

func (*Filter) Formats

func (f *Filter) Formats(formats ...string) *Filter

Formats filters by document format (e.g., "pdf", "docx", "customdata")

func (*Filter) GetDateRange

func (f *Filter) GetDateRange() *DateRange

GetDateRange returns the date range filter

func (*Filter) GetExternalIDs

func (f *Filter) GetExternalIDs() []string

GetExternalIDs returns the external ID filters

func (*Filter) GetFormats

func (f *Filter) GetFormats() []string

GetFormats returns the format filters

func (*Filter) GetHasEmbeddings

func (f *Filter) GetHasEmbeddings() *bool

GetHasEmbeddings returns the embeddings filter

func (*Filter) GetMaxPageCount

func (f *Filter) GetMaxPageCount() int

GetMaxPageCount returns the maximum page count filter

func (*Filter) GetMinPageCount

func (f *Filter) GetMinPageCount() int

GetMinPageCount returns the minimum page count filter

func (*Filter) GetSources

func (f *Filter) GetSources() []string

GetSources returns the source filters

func (*Filter) GetTags

func (f *Filter) GetTags() map[string]string

GetTags returns the tag filters

func (*Filter) HasEmbeddings

func (f *Filter) HasEmbeddings(has bool) *Filter

HasEmbeddings filters documents that have (or don't have) embeddings

func (*Filter) IsEmpty

func (f *Filter) IsEmpty() bool

IsEmpty returns true if no filters are set

func (*Filter) MaxPages

func (f *Filter) MaxPages(n int) *Filter

MaxPages filters documents with at most n pages

func (*Filter) MinPages

func (f *Filter) MinPages(n int) *Filter

MinPages filters documents with at least n pages

func (*Filter) Sources

func (f *Filter) Sources(sources ...string) *Filter

Sources filters by source identifiers (e.g., "crm", "faq")

func (*Filter) Tag

func (f *Filter) Tag(key, value string) *Filter

Tag adds a single tag filter

func (*Filter) Tags

func (f *Filter) Tags(tags map[string]string) *Filter

Tags filters by multiple tags (AND logic - all must match)

type FilterConfig

type FilterConfig struct {
	Sources       []string
	Formats       []string
	Tags          map[string]string
	DateStart     time.Time
	DateEnd       time.Time
	MinPageCount  int
	MaxPageCount  int
	HasEmbeddings *bool
	ExternalIDs   []string
}

FilterConfig is the internal representation used by search

type FontError

type FontError struct {
	FontName string // Font name
	Message  string // What went wrong
	Err      error  // Underlying error
}

FontError indicates an error processing a font

func NewFontError

func NewFontError(fontName, message string, err error) *FontError

NewFontError creates a new FontError

func (*FontError) Error

func (e *FontError) Error() string

func (*FontError) Unwrap

func (e *FontError) Unwrap() error

type FontInfo

type FontInfo struct {
	Name   string  `json:"name"`             // Font name (e.g., "Helvetica-Bold")
	Size   float64 `json:"size"`             // Font size in points
	Bold   bool    `json:"bold,omitempty"`   // Is bold
	Italic bool    `json:"italic,omitempty"` // Is italic
}

FontInfo contains font metadata for text content

type HNSWConfig added in v0.0.6

type HNSWConfig struct {
	M        int // Max connections per layer (default: 16, range: 4-64)
	EfConst  int // Construction ef parameter (default: 200, range: 10-500)
	EfSearch int // Search ef parameter (default: 50, range: 10-500)
}

HNSWConfig configures the HNSW vector index parameters

type ImageInfo

type ImageInfo struct {
	ID           string `json:"id"`                      // Image UUID
	DocumentID   string `json:"document_id,omitempty"`   // Parent document ID
	BlockID      string `json:"block_id,omitempty"`      // Associated content block ID
	Format       string `json:"format"`                  // png, jpeg, etc.
	Width        int    `json:"width"`                   // Image width in pixels
	Height       int    `json:"height"`                  // Image height in pixels
	Page         int    `json:"page"`                    // Page number
	OriginalName string `json:"original_name,omitempty"` // Original image name from PDF/DOCX
}

ImageInfo contains metadata about an extracted image

type IndexOption

type IndexOption func(*indexConfig)

IndexOption configures indexing behavior

func WithDeferEmbedding added in v0.0.6

func WithDeferEmbedding(defer_ bool) IndexOption

WithDeferEmbedding skips embedding generation during indexing. Use store.EmbedPendingDocuments() or store.EmbedDocuments() to generate embeddings later in a batch operation. This is recommended for bulk imports. See OPTIMISATIONS.md for recommended patterns.

func WithForceReindex

func WithForceReindex(force bool) IndexOption

WithForceReindex forces re-indexing even if document exists

func WithIndexSource added in v0.0.13

func WithIndexSource(source string) IndexOption

WithIndexSource sets a custom source identifier for the document. This overrides the default format-based source ("pdf" or "docx"). Use this for logical categorization like "knowledgebase", "manual", etc.

func WithIndexTags added in v0.0.13

func WithIndexTags(tags map[string]string) IndexOption

WithIndexTags sets metadata tags for the document. Tags enable filtering in search queries via WithTags() search option.

func WithName

func WithName(name string) IndexOption

WithName overrides the document name

func WithProgressCallback

func WithProgressCallback(fn ProgressCallback) IndexOption

WithProgressCallback sets a callback for progress updates during indexing

func WithSourcePath

func WithSourcePath(path string) IndexOption

WithSourcePath sets the original source path for metadata

type IndexProgress

type IndexProgress struct {
	DocumentID      string        `json:"document_id"`
	DocumentName    string        `json:"document_name"`
	Status          string        `json:"status"` // "parsing", "extracting", "indexing", "embedding", "complete", "error"
	TotalPages      int           `json:"total_pages"`
	ProcessedPages  int           `json:"processed_pages"`
	TotalBlocks     int           `json:"total_blocks"`
	ProcessedBlocks int           `json:"processed_blocks"`
	Error           error         `json:"error,omitempty"`
	StartTime       time.Time     `json:"start_time"`
	ElapsedTime     time.Duration `json:"elapsed_time"`
}

IndexProgress reports progress during document indexing

type ObjectError

type ObjectError struct {
	ObjectNum int    // Object number
	GenNum    int    // Generation number
	Message   string // What went wrong
	Err       error  // Underlying error
}

ObjectError indicates an error with a specific PDF object

func NewObjectError

func NewObjectError(objNum, genNum int, message string, err error) *ObjectError

NewObjectError creates a new ObjectError

func (*ObjectError) Error

func (e *ObjectError) Error() string

func (*ObjectError) Unwrap

func (e *ObjectError) Unwrap() error

type Page

type Page struct {
	Number int            `json:"number"`
	Width  float64        `json:"width"`
	Height float64        `json:"height"`
	Blocks []ContentBlock `json:"blocks"`
}

Page represents a single page with its content

type PageError

type PageError struct {
	PageNum int    // 1-indexed page number
	Message string // What went wrong
	Err     error  // Underlying error
}

PageError indicates an error processing a specific page

func NewPageError

func NewPageError(pageNum int, message string, err error) *PageError

NewPageError creates a new PageError

func (*PageError) Error

func (e *PageError) Error() string

func (*PageError) Unwrap

func (e *PageError) Unwrap() error

type ParseError

type ParseError struct {
	Op      string // Operation that failed (e.g., "lexer.readToken")
	Offset  int64  // Byte offset in file where error occurred
	Message string // Human-readable message
	Err     error  // Underlying error
}

ParseError provides detailed information about PDF parsing failures

func NewParseError

func NewParseError(op string, offset int64, message string, err error) *ParseError

NewParseError creates a new ParseError

func (*ParseError) Error

func (e *ParseError) Error() string

func (*ParseError) Unwrap

func (e *ParseError) Unwrap() error

type Posting

type Posting struct {
	DocumentID string  `json:"doc_id"`
	BlockID    string  `json:"block_id"`
	Positions  []int   `json:"positions"` // Positions within the block text
	TF         float64 `json:"tf"`        // Term frequency for this posting
}

Posting represents a term occurrence in the index

type ProgressCallback

type ProgressCallback func(IndexProgress)

ProgressCallback is called during document indexing to report progress

type QueryType

type QueryType string

QueryType represents the detected intent of a search query

const (
	// QueryTypeFactual for questions like "What is X?", "Who is Y?"
	QueryTypeFactual QueryType = "factual"
	// QueryTypeNavigation for "Show me section...", "Find..."
	QueryTypeNavigation QueryType = "navigation"
	// QueryTypeSummary for "Summarize...", "Overview of..."
	QueryTypeSummary QueryType = "summary"
	// QueryTypeComparison for "Compare X and Y", "Difference between..."
	QueryTypeComparison QueryType = "comparison"
	// QueryTypeDefinition for "Define X", "What is the definition of..."
	QueryTypeDefinition QueryType = "definition"
	// QueryTypeList for "List all X", "Enumerate...", "What are all..."
	QueryTypeList QueryType = "list"
	// QueryTypeUnknown when intent cannot be determined
	QueryTypeUnknown QueryType = "unknown"
)

func DetectQueryType

func DetectQueryType(query string) QueryType

DetectQueryType analyzes a search query and returns its detected intent

type SearchDiagnostics added in v0.0.14

type SearchDiagnostics struct {
	KeywordResults  int           `json:"keyword_results"`   // Results from BM25 keyword search
	VectorResults   int           `json:"vector_results"`    // Results from vector/semantic search
	KeywordTime     time.Duration `json:"keyword_time"`      // Time spent on keyword search
	VectorTime      time.Duration `json:"vector_time"`       // Time spent on vector search
	FusionTime      time.Duration `json:"fusion_time"`       // Time spent fusing results
	FilteredByScore int           `json:"filtered_by_score"` // Results filtered by MinScore
	DiversifiedFrom int           `json:"diversified_from"`  // Results before diversification (0 if not applied)
}

SearchDiagnostics provides detailed information about how search was executed. Useful for debugging and optimizing search performance.

type SearchError

type SearchError struct {
	Query   string // The search query
	Message string // What went wrong
	Err     error  // Underlying error
}

SearchError indicates a search operation failure

func NewSearchError

func NewSearchError(query, message string, err error) *SearchError

NewSearchError creates a new SearchError

func (*SearchError) Error

func (e *SearchError) Error() string

func (*SearchError) Unwrap

func (e *SearchError) Unwrap() error

type SearchMode

type SearchMode string

SearchMode defines the type of search

const (
	// SearchModeKeyword uses BM25 keyword search only
	SearchModeKeyword SearchMode = "keyword"
	// SearchModeSemantic uses vector similarity search only
	SearchModeSemantic SearchMode = "semantic"
	// SearchModeHybrid combines BM25 and vector search with RRF fusion
	SearchModeHybrid SearchMode = "hybrid"
)

func SuggestedSearchMode

func SuggestedSearchMode(qt QueryType) SearchMode

SuggestedSearchMode returns the recommended search mode for a query type

type SearchOption

type SearchOption func(*searchConfig)

SearchOption configures search behavior

func WithAgentOutput

func WithAgentOutput(enabled bool) SearchOption

WithAgentOutput enables agent-friendly output format

func WithChunking

func WithChunking(opts ChunkOptions) SearchOption

WithChunking configures result chunking for LLM context windows

func WithCitations

func WithCitations(enabled bool) SearchOption

WithCitations adds citation references [1], [2], etc. to results

func WithContextWindow

func WithContextWindow(blocks int) SearchOption

WithContextWindow sets the number of surrounding blocks to include

func WithDiagnostics added in v0.0.14

func WithDiagnostics(enabled bool) SearchOption

WithDiagnostics enables detailed search diagnostics in the results. Diagnostics include timing breakdowns, result counts per search type, and filtering stats. Useful for debugging and optimizing search performance.

func WithDiversify added in v0.0.14

func WithDiversify(maxPerDoc int) SearchOption

WithDiversify limits the number of results per document to improve variety. This is useful when multiple blocks from the same document match the query. Set maxPerDoc to 0 (default) for unlimited results per document.

func WithDocuments

func WithDocuments(docIDs ...string) SearchOption

WithDocuments limits search to specific documents

func WithEfSearch added in v0.0.11

func WithEfSearch(ef int) SearchOption

WithEfSearch overrides the HNSW efSearch parameter for this query. Higher values improve recall at the cost of latency. If ef <= 0 (default), the store's configured EfSearch value is used. Recommended: 50 for speed, 100 for balanced, 200+ for high recall.

func WithEstimateTokens

func WithEstimateTokens(enabled bool) SearchOption

WithEstimateTokens includes token count estimates in results

func WithFilter

func WithFilter(f *Filter) SearchOption

WithFilter applies an advanced filter DSL

func WithHighlight

func WithHighlight(pre, post string) SearchOption

WithHighlight sets the highlight markers for matched terms

func WithImages

func WithImages(include bool) SearchOption

WithImages includes image blocks in search results

func WithKeywordWeight

func WithKeywordWeight(weight float64) SearchOption

WithKeywordWeight sets the weight for keyword search in hybrid mode (0-1)

func WithMaxResults

func WithMaxResults(n int) SearchOption

WithMaxResults sets the maximum number of results

func WithMetadata added in v0.0.12

func WithMetadata(include bool) SearchOption

WithMetadata includes document metadata (tags, source, external ID) in search results. This adds extra database queries per result, so it's disabled by default for performance.

func WithMinScore

func WithMinScore(score float64) SearchOption

WithMinScore sets the minimum relevance score threshold

func WithPageRange

func WithPageRange(start, end int) SearchOption

WithPageRange limits search to a specific page range

func WithSearchMode

func WithSearchMode(mode SearchMode) SearchOption

WithSearchMode sets the search mode (keyword, semantic, or hybrid)

func WithSections

func WithSections(sections ...string) SearchOption

WithSections limits search to specific sections

func WithSources

func WithSources(sources ...string) SearchOption

WithSources filters search results by source or format (e.g., "pdf", "docx", "crm")

func WithTags

func WithTags(tags map[string]string) SearchOption

WithTags filters search results by tags (AND logic - all must match)

func WithVectorWeight

func WithVectorWeight(weight float64) SearchOption

WithVectorWeight sets the weight for vector search in hybrid mode (0-1)

type SearchResult

type SearchResult struct {
	DocumentID   string            `json:"document_id"`
	DocumentName string            `json:"document_name"`
	BlockID      string            `json:"block_id"`
	Content      string            `json:"content"`               // Matched content
	Snippet      string            `json:"snippet"`               // Highlighted snippet
	Score        float64           `json:"score"`                 // Relevance score
	Page         int               `json:"page"`                  // Page number
	Section      string            `json:"section"`               // Section name
	Context      []ContentBlock    `json:"context"`               // Surrounding blocks for RAG
	Positions    []int             `json:"positions"`             // Match positions in content
	Images       []string          `json:"images,omitempty"`      // Image paths in same section
	Tags         map[string]string `json:"tags,omitempty"`        // Document tags
	Source       string            `json:"source,omitempty"`      // Source identifier (for CustomData)
	ExternalID   string            `json:"external_id,omitempty"` // External system ID
}

SearchResult represents a single search hit

type SearchResults

type SearchResults struct {
	Query       string             `json:"query"`
	TotalHits   int                `json:"total_hits"`
	Results     []SearchResult     `json:"results"`
	SearchTime  time.Duration      `json:"search_time"`
	Diagnostics *SearchDiagnostics `json:"diagnostics,omitempty"`
}

SearchResults contains search results with metadata

type SemanticInfo

type SemanticInfo struct {
	IsHeading    bool     `json:"is_heading,omitempty"`
	HeadingLevel int      `json:"heading_level,omitempty"` // 1-6 like HTML
	Section      string   `json:"section,omitempty"`       // Parent section title
	Keywords     []string `json:"keywords,omitempty"`      // Extracted keywords
	Context      string   `json:"context,omitempty"`       // Surrounding context summary
}

SemanticInfo contains AI-friendly metadata about content

type StorageError

type StorageError struct {
	Op      string // Operation (e.g., "write", "read", "delete")
	Path    string // File or directory path
	Message string // What went wrong
	Err     error  // Underlying error
}

StorageError indicates a storage operation failure

func NewStorageError

func NewStorageError(op, path, message string, err error) *StorageError

NewStorageError creates a new StorageError

func (*StorageError) Error

func (e *StorageError) Error() string

func (*StorageError) Unwrap

func (e *StorageError) Unwrap() error

type Store

type Store struct {
	// contains filtered or unexported fields
}

Store manages document storage, indexing, and search with unified SQLite backend

func NewStore

func NewStore(basePath string, opts ...StoreOption) (*Store, error)

NewStore creates a new document store at the specified path using SQLite

func (*Store) CancelBackground added in v0.0.9

func (s *Store) CancelBackground()

CancelBackground cancels background embedding if running. The operation will stop after the current document completes.

func (*Store) CheckDuplicate

func (s *Store) CheckDuplicate(path string) (*DedupResult, error)

CheckDuplicate checks if a document at the given path is a duplicate of an existing document. It uses file checksum for comparison.

func (*Store) CheckDuplicateByContent

func (s *Store) CheckDuplicateByContent(data []byte) (*DedupResult, error)

CheckDuplicateByContent checks if content already exists in the store. It computes a content hash from the provided data and checks for matches.

func (*Store) CheckHealth added in v0.0.8

func (s *Store) CheckHealth() (*StoreHealth, error)

CheckHealth performs a comprehensive consistency check on the store. It checks for HNSW-SQLite synchronization, incomplete embeddings, and other issues.

func (*Store) Close

func (s *Store) Close() error

Close releases resources held by the store

func (*Store) DatabaseInfo added in v0.0.5

func (s *Store) DatabaseInfo() (*DatabaseInfo, error)

DatabaseInfo returns information about the database schema and version

func (*Store) DeleteDocument

func (s *Store) DeleteDocument(id string) error

DeleteDocument removes a document from the store

func (*Store) DetectQueryType

func (s *Store) DetectQueryType(query string) QueryType

DetectQueryType analyzes a query string and returns its detected intent type

func (*Store) EmbedDocuments added in v0.0.6

func (s *Store) EmbedDocuments(docIDs ...string) error

EmbedDocuments generates embeddings for specific documents by ID. This is useful for resumable batch processing. OPTIMIZED: Saves HNSW index only once at the end instead of after each document.

func (*Store) EmbedPendingDocuments added in v0.0.6

func (s *Store) EmbedPendingDocuments() error

EmbedPendingDocuments generates embeddings for all documents that don't have them yet. This is the main method for deferred embedding patterns. OPTIMIZED: Saves HNSW index only once at the end instead of after each document.

func (*Store) EmbedPendingDocumentsAsync added in v0.0.9

func (s *Store) EmbedPendingDocumentsAsync() error

EmbedPendingDocumentsAsync starts embedding in background. Returns immediately. Use GetBackgroundStatus() to check progress, IsBackgroundRunning() to check if still running, or WaitForBackground() to block.

func (*Store) FindByExternalID added in v0.0.11

func (s *Store) FindByExternalID(source, externalID string) (*Document, error)

FindByExternalID finds a document by source and external ID

func (*Store) GetBackgroundStatus added in v0.0.9

func (s *Store) GetBackgroundStatus() BackgroundEmbeddingStatus

GetBackgroundStatus returns the current status of background embedding. Safe to call even when no background operation is running.

func (*Store) GetContext

func (s *Store) GetContext(docID, blockID string, windowSize int) (*ContextResult, error)

GetContext retrieves content blocks around a specific block

func (*Store) GetDocument

func (s *Store) GetDocument(id string) (*Document, error)

GetDocument retrieves a document by ID

func (*Store) GetDocumentsWithIncompleteEmbeddings added in v0.0.8

func (s *Store) GetDocumentsWithIncompleteEmbeddings() ([]*DocumentInfo, error)

GetDocumentsWithIncompleteEmbeddings returns documents that have some but not all blocks embedded. This identifies documents where embedding was interrupted mid-way and need recovery.

func (*Store) GetDocumentsWithoutEmbeddings added in v0.0.6

func (s *Store) GetDocumentsWithoutEmbeddings() ([]*DocumentInfo, error)

GetDocumentsWithoutEmbeddings returns documents that have embeddable content but don't have any embeddings yet. Use this for resumable maintenance tasks.

func (*Store) GetEmbeddingStatus

func (s *Store) GetEmbeddingStatus(docID string) (*EmbeddingStatus, error)

GetEmbeddingStatus returns the embedding status for a document

func (*Store) GetImagesByDocumentFiltered

func (s *Store) GetImagesByDocumentFiltered(docID, section string, page int) ([]ImageInfo, error)

GetImagesByDocumentFiltered returns images for a document with optional section/page filters

func (*Store) GetLastImportTime

func (s *Store) GetLastImportTime(source string) (time.Time, error)

GetLastImportTime returns the most recent import timestamp for a given source. Returns zero time if no imports are found for the source.

func (*Store) HasEmbeddings

func (s *Store) HasEmbeddings(docID string) (bool, error)

HasEmbeddings is a convenience method that returns true if a document has any embeddings

func (*Store) IndexCustomData

func (s *Store) IndexCustomData(data *CustomData, opts ...IndexOption) (*Document, error)

IndexCustomData indexes custom structured data

func (*Store) IndexCustomDataBatch added in v0.0.6

func (s *Store) IndexCustomDataBatch(data []*CustomData, opts ...IndexOption) ([]*Document, error)

IndexCustomDataBatch indexes multiple custom data entries efficiently. This is optimized for bulk imports with deferred global stats updates.

func (*Store) IndexDocument

func (s *Store) IndexDocument(path string, opts ...IndexOption) (*Document, error)

IndexDocument indexes a document from a file path

func (*Store) IndexDocumentWithProgress

func (s *Store) IndexDocumentWithProgress(path string, callback ProgressCallback, opts ...IndexOption) (*Document, error)

IndexDocumentWithProgress indexes a document with progress callbacks

func (*Store) IndexReader

func (s *Store) IndexReader(r io.Reader, name string, opts ...IndexOption) (*Document, error)

IndexReader indexes a document from an io.Reader

func (*Store) IsBackgroundRunning added in v0.0.9

func (s *Store) IsBackgroundRunning() bool

IsBackgroundRunning returns true if background embedding is in progress.

func (*Store) ListDocuments

func (s *Store) ListDocuments() ([]*DocumentInfo, error)

ListDocuments returns all indexed documents

func (*Store) Repair added in v0.0.8

func (s *Store) Repair() error

Repair fixes detected inconsistencies in the store. It rebuilds the HNSW index from SQLite and resumes incomplete embeddings. Returns nil if no repairs were needed or all repairs succeeded.

func (*Store) ResumeAllIncompleteEmbeddings added in v0.0.8

func (s *Store) ResumeAllIncompleteEmbeddings() error

ResumeAllIncompleteEmbeddings resumes embedding for all documents with incomplete embeddings. This is useful for recovering from crashes or interruptions during batch embedding.

func (*Store) ResumeEmbedding added in v0.0.8

func (s *Store) ResumeEmbedding(docID string) error

ResumeEmbedding continues embedding for a document that was partially embedded. Only embeds blocks that don't already have vectors, making it safe to call on documents that were interrupted during embedding.

func (*Store) Search

func (s *Store) Search(query string, opts ...SearchOption) (*SearchResults, error)

Search performs a search across all documents

func (*Store) SearchForAgent

func (s *Store) SearchForAgent(query string, opts ...SearchOption) (*AgentSearchResponse, error)

SearchForAgent performs a search optimized for AI agent consumption. Returns structured output with token estimates, citation references, and chunked results.

func (*Store) SearchInDocument

func (s *Store) SearchInDocument(docID, query string, opts ...SearchOption) (*SearchResults, error)

SearchInDocument searches within a specific document

func (*Store) SetEmbeddingProvider

func (s *Store) SetEmbeddingProvider(provider embedding.Provider) error

SetEmbeddingProvider configures the embedding provider after store creation. It automatically detects and repairs inconsistencies between the HNSW index and SQLite vectors, rebuilding the index if necessary.

func (*Store) Stats

func (s *Store) Stats() StoreStats

Stats returns statistics about the store

func (*Store) UpsertCustomData

func (s *Store) UpsertCustomData(data *CustomData, opts ...IndexOption) (*Document, error)

UpsertCustomData updates an existing document or creates a new one based on source + external_id. If ExternalID is provided and a document with the same source + external_id exists, it will be updated. If ExternalID is empty or no matching document exists, a new document is created.

func (*Store) WaitForBackground added in v0.0.9

func (s *Store) WaitForBackground() error

WaitForBackground blocks until background embedding completes. Returns the error from background embedding if any, or nil if successful. Returns nil immediately if no background operation is running.

type StoreHealth added in v0.0.8

type StoreHealth struct {
	IsHealthy            bool     `json:"is_healthy"`            // True if all checks pass
	HNSWSize             int      `json:"hnsw_size"`             // Number of vectors in HNSW index
	SQLiteVectorCount    int      `json:"sqlite_vector_count"`   // Number of vectors in SQLite
	HNSWSynced           bool     `json:"hnsw_synced"`           // True if HNSW matches SQLite
	IncompleteEmbeddings []string `json:"incomplete_embeddings"` // Document IDs with partial embeddings
	PendingEmbeddings    []string `json:"pending_embeddings"`    // Document IDs without any embeddings
	DocumentCount        int      `json:"document_count"`        // Total number of documents
	BlockCount           int      `json:"block_count"`           // Total number of content blocks
}

StoreHealth contains consistency check results for diagnosing store issues

type StoreOption

type StoreOption func(*storeConfig)

StoreOption configures Store behavior

func WithCache

func WithCache(enabled bool, size int) StoreOption

WithCache enables/disables object caching

func WithChecksum

func WithChecksum(enabled bool) StoreOption

WithChecksum enables/disables document checksum computation

func WithDedupCheck

func WithDedupCheck(enabled bool) StoreOption

WithDedupCheck enables duplicate detection before indexing

func WithHNSWConfig added in v0.0.6

func WithHNSWConfig(cfg HNSWConfig) StoreOption

WithHNSWConfig configures the HNSW vector index parameters. Use this to tune performance vs quality trade-offs for bulk imports. See OPTIMISATIONS.md for recommended settings by dataset size.

func WithImageExtraction

func WithImageExtraction(enabled bool) StoreOption

WithImageExtraction enables/disables image extraction

func WithMaxConcurrency

func WithMaxConcurrency(n int) StoreOption

WithMaxConcurrency sets the maximum concurrent operations

func WithNGrams

func WithNGrams(enabled bool, size int) StoreOption

WithNGrams enables n-gram indexing for fuzzy search

func WithSemanticAnalysis

func WithSemanticAnalysis(enabled bool) StoreOption

WithSemanticAnalysis enables/disables semantic analysis

func WithStemming

func WithStemming(enabled bool) StoreOption

WithStemming enables/disables Porter stemming in search

func WithStopWords

func WithStopWords(enabled bool) StoreOption

WithStopWords enables/disables stop word filtering

type StoreStats

type StoreStats struct {
	DocumentCount int   `json:"document_count"`
	TotalBlocks   int   `json:"total_blocks"`
	TotalImages   int   `json:"total_images"`
	IndexTerms    int   `json:"index_terms"`
	VectorCount   int   `json:"vector_count"`
	StorageBytes  int64 `json:"storage_bytes"`
}

StoreStats contains statistics about the store

type StreamError

type StreamError struct {
	Filter  string // Filter name (e.g., "FlateDecode")
	Message string // What went wrong
	Err     error  // Underlying error
}

StreamError indicates an error decoding a stream

func NewStreamError

func NewStreamError(filter, message string, err error) *StreamError

NewStreamError creates a new StreamError

func (*StreamError) Error

func (e *StreamError) Error() string

func (*StreamError) Unwrap

func (e *StreamError) Unwrap() error

type TermEntry

type TermEntry struct {
	Term     string    `json:"term"`
	DF       int       `json:"df"`       // Document frequency
	Postings []Posting `json:"postings"` // All occurrences
}

TermEntry contains all postings for a term

type TokenBudget

type TokenBudget struct {
	MaxTokens  int
	UsedTokens int
}

TokenBudget helps track token usage across multiple operations

func NewTokenBudget

func NewTokenBudget(maxTokens int) *TokenBudget

NewTokenBudget creates a new token budget tracker

func (*TokenBudget) Add

func (b *TokenBudget) Add(tokens int) bool

Add adds tokens to the budget, returns true if within budget

func (*TokenBudget) AddText

func (b *TokenBudget) AddText(text string) bool

AddText estimates and adds tokens for text, returns true if within budget

func (*TokenBudget) IsExhausted

func (b *TokenBudget) IsExhausted() bool

IsExhausted returns true if budget is exhausted

func (*TokenBudget) Remaining

func (b *TokenBudget) Remaining() int

Remaining returns remaining token budget

func (*TokenBudget) Reset

func (b *TokenBudget) Reset()

Reset resets the budget to zero usage

func (*TokenBudget) Usage

func (b *TokenBudget) Usage() float64

Usage returns the current usage percentage (0-100)

Directories

Path Synopsis
cmd
test_pdf command
internal
nlp
Package nlp provides natural language processing utilities for text analysis.
pdf
Package search provides hybrid search functionality combining keyword and vector search.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL