docuindex

package module
v0.0.14 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Jan 11, 2026 License: MIT Imports: 25 Imported by: 0

README

DocuIndex

A pure Go package for parsing PDF and DOCX files and extracting structured content optimized for AI search and RAG (Retrieval-Augmented Generation) applications.

**This package is under very active development. Expect frequent updates and improvements. Not yet stable.**

Features

  • Pure Go - No CGO or external dependencies
  • PDF Parsing - Complete PDF parser with PostScript content stream interpreter
  • DOCX Parsing - Full DOCX support via ZIP/XML parsing with style resolution
  • Custom Data Sources - Index arbitrary structured data with tag-based filtering
  • Text Extraction - Extract text with positioning, font info, and semantic structure
  • Image Extraction - Extract embedded images (JPEG, PNG, GIF, BMP, TIFF)
  • Semantic Analysis - Automatic heading detection, section tracking, keyword extraction
  • SQLite Storage - Unified SQLite database for all metadata and search indices
  • Hybrid Search - BM25 keyword search + vector semantic search with RRF fusion
  • Embedding Providers - Azure OpenAI, OpenAI, and Ollama support
  • Thread-Safe - Safe for concurrent use

Installation

go get github.com/mmalcek/docuIndex

Quick Start

package main

import (
    "fmt"
    "log"

    "github.com/mmalcek/docuIndex"
)

func main() {
    // Create a store (documents will be saved to ./data directory)
    store, err := docuindex.NewStore("./data")
    if err != nil {
        log.Fatal(err)
    }
    defer store.Close()

    // Index a PDF or DOCX file
    doc, err := store.IndexDocument("./document.pdf")   // PDF
    // doc, err := store.IndexDocument("./document.docx") // DOCX
    if err != nil {
        log.Fatal(err)
    }
    fmt.Printf("Indexed document: %s (%d pages)\n", doc.Info.Name, doc.Info.PageCount)

    // Search across all documents
    results, err := store.Search("search query")
    if err != nil {
        log.Fatal(err)
    }

    for _, r := range results.Results {
        fmt.Printf("Found in %s (page %d): %s\n", r.DocumentName, r.Page, r.Snippet)
    }
}

API Reference

Store Operations
Create a Store
// Basic store
store, err := docuindex.NewStore("./data")

// With options
store, err := docuindex.NewStore("./data",
    docuindex.WithImageExtraction(true),    // Extract images from PDFs
    docuindex.WithChecksum(true),           // Compute SHA-256 checksums
    docuindex.WithSemanticAnalysis(true),   // Enable heading/section detection
    docuindex.WithStemming(true),           // Enable Porter stemming for search
    docuindex.WithStopWords(true),          // Filter common stop words
)
Configure Embedding Provider (Optional)
import "github.com/mmalcek/docuIndex/embedding"

// Azure OpenAI with API key (defaults to API version 2024-10-21)
provider, err := embedding.NewProvider(embedding.Config{
    Provider:   "azure",
    Endpoint:   os.Getenv("AZURE_ENDPOINT"),
    APIKey:     os.Getenv("AZURE_API_KEY"),
    Model:      "text-embedding-3-small",
    // APIVersion: "v1",           // Optional: use new v1 API format
    // APIVersion: "2024-10-21",   // Optional: explicit version (default)
})

// Azure OpenAI with Azure Identity (Managed Identity, DefaultAzureCredential, etc.)
// Optional: requires github.com/Azure/azure-sdk-for-go/sdk/azidentity
import "github.com/Azure/azure-sdk-for-go/sdk/azidentity"

cred, err := azidentity.NewDefaultAzureCredential(nil)
provider, err := embedding.NewProvider(embedding.Config{
    Provider:        "azure",
    Endpoint:        os.Getenv("AZURE_ENDPOINT"),
    Model:           "text-embedding-3-small",
    TokenCredential: cred,  // Uses Bearer token instead of api-key header
})
// Tokens are cached and automatically refreshed before expiry

// OpenAI
provider, err := embedding.NewProvider(embedding.Config{
    Provider: "openai",
    APIKey:   os.Getenv("OPENAI_API_KEY"),
    Model:    "text-embedding-3-small",
})

// Ollama (local)
provider, err := embedding.NewProvider(embedding.Config{
    Provider: "ollama",
    Endpoint: "http://localhost:11434",
    Model:    "nomic-embed-text",
})

// Add to store for semantic search
store.SetEmbeddingProvider(provider)
Index Documents
// Index from file path (PDF or DOCX)
doc, err := store.IndexDocument("./document.pdf")
doc, err := store.IndexDocument("./document.docx")

// Index from io.Reader
file, _ := os.Open("document.pdf")
doc, err := store.IndexReader(file, "document.pdf")

file, _ := os.Open("document.docx")
doc, err := store.IndexReader(file, "document.docx")

// With custom name
doc, err := store.IndexDocument("./document.pdf",
    docuindex.WithName("My Custom Name"),
)

// With custom source and tags for filtering
doc, err := store.IndexDocument("./document.pdf",
    docuindex.WithIndexSource("knowledgebase"),
    docuindex.WithIndexTags(map[string]string{
        "department": "engineering",
        "project":    "alpha",
    }),
)
// Now searchable with:
// store.Search("query", docuindex.WithSources("knowledgebase"))
// store.Search("query", docuindex.WithTags(map[string]string{"department": "engineering"}))

// With deferred embedding (for bulk imports - embed later with EmbedPendingDocuments)
doc, err := store.IndexDocument("./document.pdf",
    docuindex.WithDeferEmbedding(true),
)
Index Custom Data
// Index structured data from any source (creates new document each time)
doc, err := store.IndexCustomData(&docuindex.CustomData{
    Source:      "crm-api",
    Name:        "Customer Notes Q4 2024",
    Description: "Extracted customer interaction notes",
    Tags: map[string]string{
        "quarter": "Q4-2024",
        "type":    "customer-notes",
    },
    ImportedAt: time.Now(), // Optional: track import time for incremental updates
    Entries: []docuindex.DataEntry{
        {Content: "Meeting with Acme Corp about renewal..."},
        {Content: "Support ticket #1234: User reported issue..."},
        {Content: "Sales call summary: Interested in enterprise plan..."},
    },
})

// Index custom data with images
doc, err := store.IndexCustomData(&docuindex.CustomData{
    Source: "products",
    Name:   "Product Catalog",
    Entries: []docuindex.DataEntry{
        {
            ID:      "laptop-001",
            Content: "MacBook Pro 16-inch with M3 chip, 36GB RAM",
            Images: []docuindex.CustomImage{
                {
                    Data:        jpegBytes,           // Image bytes (required)
                    Format:      "jpeg",              // "png", "jpeg", "gif", "bmp" (required)
                    Description: "Front view",       // AI-friendly alt text (optional)
                    OriginalName: "macbook.jpg",     // Display name (optional)
                    // Width/Height auto-detected if not provided
                },
            },
        },
    },
    // Document-level images (not tied to specific entry)
    Images: []docuindex.CustomImage{
        {Data: logoPng, Format: "png", Description: "Company logo"},
    },
})

// Upsert custom data (update existing document if source + external_id match)
// On update, existing images are automatically replaced
doc, err := store.UpsertCustomData(&docuindex.CustomData{
    Source:      "salesforce-api",
    Name:        "Salesforce Opportunities",
    ExternalID:  "opportunities-q4",  // Optional - enables update-or-create behavior
    ImportedAt:  time.Now(),
    Entries: []docuindex.DataEntry{
        {Content: "Acme Corp - $50k deal in progress..."},
        {Content: "Widget Inc - Renewal pending..."},
    },
})

// Get last import time for incremental updates
lastImport, err := store.GetLastImportTime("crm-api")
if !lastImport.IsZero() {
    // Fetch only new data from source since lastImport
}
Retrieve Documents
// Get by ID
doc, err := store.GetDocument("document-id")

// Find by external system ID (e.g., Salesforce record ID)
doc, err := store.FindByExternalID("salesforce-api", "opportunities-q4")
if doc != nil {
    fmt.Printf("Found: %s\n", doc.Info.Name)
}

// List all documents
docs, err := store.ListDocuments()
for _, info := range docs {
    fmt.Printf("%s: %s (%d pages)\n", info.ID, info.Name, info.PageCount)
}

// Delete a document
err := store.DeleteDocument("document-id")
Search Operations
results, err := store.Search("machine learning")

for _, r := range results.Results {
    fmt.Printf("Document: %s\n", r.DocumentName)
    fmt.Printf("Page: %d\n", r.Page)
    fmt.Printf("Section: %s\n", r.Section)
    fmt.Printf("Score: %.2f\n", r.Score)
    fmt.Printf("Snippet: %s\n", r.Snippet)
}

// Search with images included in results
results, err = store.Search("diagram",
    docuindex.WithImages(true),
)
for _, r := range results.Results {
    if len(r.Images) > 0 {
        fmt.Printf("Images in section: %v\n", r.Images)
        // e.g., ["images/uuid1.png", "images/uuid2.jpeg"]
    }
}
Search Modes
// Hybrid search (BM25 + vectors with RRF fusion) - default
// Falls back to keyword-only if no embedding provider is configured
results, err := store.Search("climate change impacts",
    docuindex.WithVectorWeight(0.6),   // Weight for semantic results
    docuindex.WithKeywordWeight(0.4),  // Weight for keyword results
)

// Keyword search (BM25 only)
results, err := store.Search("neural networks",
    docuindex.WithSearchMode(docuindex.SearchModeKeyword),
)

// Semantic search (vector embeddings only) - requires embedding provider
results, err := store.Search("how does machine learning work",
    docuindex.WithSearchMode(docuindex.SearchModeSemantic),
)
Search with Options
results, err := store.Search("neural networks",
    docuindex.WithMaxResults(10),           // Limit results
    docuindex.WithMinScore(0.5),            // Minimum relevance score
    docuindex.WithContextWindow(3),         // Include 3 blocks before/after
    docuindex.WithHighlight("<b>", "</b>"), // Highlight matches in snippet
)
Search Options Reference
Core Options
Option Type Default Description
WithMaxResults(n) int 100 Maximum number of results to return
WithMinScore(score) float64 0.0 Filter out results below this relevance threshold
WithContextWindow(blocks) int 2 Number of surrounding blocks to include for RAG context
WithHighlight(pre, post) string, string "", "" Markers to wrap matched terms in snippets
WithPageRange(start, end) int, int - Limit search to specific page range
WithDocuments(...ids) []string - Search only within specified document IDs
WithSections(...names) []string - Limit search to specific section names
WithImages(bool) bool false Include image paths in search results
Search Mode Options
Option Type Default Description
WithSearchMode(mode) SearchMode Hybrid Search strategy: SearchModeKeyword, SearchModeSemantic, or SearchModeHybrid
WithVectorWeight(weight) float64 0.5 Weight for semantic/vector search results (0.0-1.0)
WithKeywordWeight(weight) float64 0.5 Weight for BM25 keyword search results (0.0-1.0)
WithEfSearch(ef) int 50 HNSW search thoroughness (50=fast, 100=balanced, 200+=high recall)
Filtering Options
Option Type Default Description
WithSources(...sources) []string - Filter by source or format (e.g., "pdf", "crm", "docx")
WithTags(tags) map[string]string - Filter by tags with AND logic; use "!" prefix for negation
WithMetadata(bool) bool false Include tags, source, external_id in results
WithFilter(filter) *Filter - Advanced Filter DSL for complex queries
AI/Agent Options
Option Type Default Description
WithAgentOutput(bool) bool false Return structured AgentSearchResponse format
WithEstimateTokens(bool) bool false Include token count estimates in results
WithCitations(bool) bool false Add citation references [1], [2], etc.
WithChunking(opts) ChunkOptions - Configure result chunking for LLM context windows
Advanced Options
Option Type Default Description
WithDiversify(maxPerDoc) int 0 Limit results per document for variety (0 = unlimited)
WithDiagnostics(bool) bool false Include detailed search diagnostics in results
Understanding Vector and Keyword Weights

The WithVectorWeight and WithKeywordWeight options control how hybrid search combines semantic (vector) and keyword (BM25) results using Reciprocal Rank Fusion (RRF).

How weights work:

  • Both values should be between 0.0 and 1.0
  • They control the relative importance of each search signal
  • The weights don't need to sum to 1.0, but typically do for clarity

Practical examples:

// Balanced hybrid search (default behavior)
results, err := store.Search("machine learning concepts",
    docuindex.WithVectorWeight(0.5),
    docuindex.WithKeywordWeight(0.5),
)

// Emphasize semantic understanding - good for conceptual queries
// "How does photosynthesis work?" benefits from semantic search
results, err := store.Search("how does photosynthesis work",
    docuindex.WithVectorWeight(0.7),
    docuindex.WithKeywordWeight(0.3),
)

// Emphasize keyword matching - good for specific terms/names
// Searching for "RFC 7231" or "John Smith" benefits from keyword search
results, err := store.Search("RFC 7231",
    docuindex.WithVectorWeight(0.3),
    docuindex.WithKeywordWeight(0.7),
)

// Pure semantic search (equivalent to SearchModeSemantic)
results, err := store.Search("explain the concept",
    docuindex.WithVectorWeight(1.0),
    docuindex.WithKeywordWeight(0.0),
)

// Pure keyword search (equivalent to SearchModeKeyword)
results, err := store.Search("exact phrase",
    docuindex.WithVectorWeight(0.0),
    docuindex.WithKeywordWeight(1.0),
)

When to adjust weights:

Query Type Recommended Weights Reason
Conceptual questions Vector: 0.7, Keyword: 0.3 Semantic search understands meaning
Specific terms/IDs Vector: 0.3, Keyword: 0.7 Keywords need exact matching
Technical documentation Vector: 0.5, Keyword: 0.5 Balance both signals
Natural language Vector: 0.6, Keyword: 0.4 Slight semantic preference
Code/API references Vector: 0.2, Keyword: 0.8 Exact symbol matching important
HNSW Search Tuning with EfSearch

The WithEfSearch option controls how thoroughly the HNSW vector index is searched. Higher values improve recall (finding more relevant results) at the cost of latency.

// Fast search - good for real-time applications
results, err := store.Search("query",
    docuindex.WithEfSearch(50),
)

// Balanced - good for most use cases
results, err := store.Search("query",
    docuindex.WithEfSearch(100),
)

// High recall - good for batch processing or critical queries
results, err := store.Search("important query",
    docuindex.WithEfSearch(200),
)

EfSearch guidelines by dataset size:

Block Count Recommended EfSearch Notes
< 10,000 50 (default) Fast, high recall naturally
10,000 - 50,000 100 Good balance
50,000 - 100,000 100-150 May need tuning
> 100,000 200+ Higher values for critical searches

Note: WithEfSearch only affects the current query. To set a default for all searches, use WithHNSWConfig when creating the store:

store, _ := docuindex.NewStore("./data",
    docuindex.WithHNSWConfig(docuindex.HNSWConfig{
        EfSearch: 100,  // Default for all queries
    }),
)
Boolean and Phrase Queries
// Boolean operators: AND, OR, NOT (or +, -)
results, err := store.Search("machine learning AND neural")
results, err := store.Search("+required -excluded optional")

// Phrase matching with quotes
results, err := store.Search(`"exact phrase match"`)
Search in Specific Document
results, err := store.SearchInDocument("doc-id", "query")
Search with Source/Tag Filtering
// Search only custom data
results, err := store.Search("renewal",
    docuindex.WithSources("customdata"))

// Search by specific source
results, err := store.Search("renewal",
    docuindex.WithSources("crm-api"))

// Search with tag filter
results, err := store.Search("renewal",
    docuindex.WithTags(map[string]string{"quarter": "Q4-2024"}))

// Search with tag negation (! prefix excludes matching values)
results, err := store.Search("bugs",
    docuindex.WithTags(map[string]string{"status": "!Closed"}))  // Not Closed

// Combined filters
results, err := store.Search("enterprise",
    docuindex.WithSources("crm-api", "faq"),
    docuindex.WithTags(map[string]string{"type": "customer-notes"}))
Search with Metadata

Include document metadata (tags, source, external ID) in search results:

// Search with metadata included
results, err := store.Search("query", docuindex.WithMetadata(true))

for _, r := range results.Results {
    fmt.Printf("Document: %s\n", r.DocumentName)
    fmt.Printf("  Source: %s\n", r.Source)           // e.g., "crm-api"
    fmt.Printf("  ExternalID: %s\n", r.ExternalID)   // e.g., "salesforce-123"
    fmt.Printf("  Tags: %v\n", r.Tags)               // e.g., {"status": "Open", "priority": "High"}
}

// Combine with tag negation for filtering
results, err := store.Search("bugs",
    docuindex.WithTags(map[string]string{"status": "!Closed"}),
    docuindex.WithMetadata(true),  // Include metadata in results
)

Note: WithMetadata is disabled by default for performance. Enable it when you need to access tags, source, or external ID from search results.

Search with Diversification

When a single document contains many matching blocks, diversification limits results per document to improve variety:

// Limit to max 3 results per document
results, err := store.Search("machine learning",
    docuindex.WithDiversify(3),
)
Search Diagnostics

Enable diagnostics to understand search performance and behavior:

results, err := store.Search("query",
    docuindex.WithDiagnostics(true),
)

if results.Diagnostics != nil {
    fmt.Printf("Keyword results: %d\n", results.Diagnostics.KeywordResults)
    fmt.Printf("Vector results: %d\n", results.Diagnostics.VectorResults)
    fmt.Printf("Filtered by MinScore: %d\n", results.Diagnostics.FilteredByScore)
    fmt.Printf("Diversified from: %d results\n", results.Diagnostics.DiversifiedFrom)
    fmt.Printf("Timing - Keyword: %v, Vector: %v, Fusion: %v\n",
        results.Diagnostics.KeywordTime,
        results.Diagnostics.VectorTime,
        results.Diagnostics.FusionTime)
}
Get Context for RAG
// Get surrounding content blocks for a specific block
ctx, err := store.GetContext("doc-id", "blk_042", 5)

// ctx.Before - blocks before the target
// ctx.Center - the target block
// ctx.After - blocks after the target
Document Structure
Content Block
type ContentBlock struct {
    ID       string       // Unique block ID (e.g., "blk_001")
    Type     BlockType    // text, heading, image, list, table
    Content  string       // Text content or image path
    Page     int          // 1-indexed page number
    BBox     BoundingBox  // Position on page
    Font     *FontInfo    // Font metadata
    Semantic SemanticInfo // Heading level, section, keywords
}
Working with Blocks
doc, _ := store.GetDocument("doc-id")

// Get all text blocks
textBlocks := doc.GetTextBlocks()

// Get all image blocks
imageBlocks := doc.GetImageBlocks()

// Get blocks from a specific page
page3Blocks := doc.GetBlocksByPage(3)

// Find a specific block
block := doc.GetBlockByID("blk_042")

// Get images by document with optional filters
images, err := store.GetImagesByDocumentFiltered("doc-id", "", 0) // All images
images, err := store.GetImagesByDocumentFiltered("doc-id", "Introduction", 0) // By section
images, err := store.GetImagesByDocumentFiltered("doc-id", "", 3) // By page

// Get image info with AI-friendly description
info, err := store.GetImageInfo("image-uuid")
fmt.Printf("Image: %s - %s\n", info.OriginalName, info.Description)
// e.g., "macbook.jpg - Front view of MacBook Pro 16-inch"
Store Statistics
stats := store.Stats()
fmt.Printf("Documents: %d\n", stats.DocumentCount)
fmt.Printf("Total blocks: %d\n", stats.TotalBlocks)
fmt.Printf("Total images: %d\n", stats.TotalImages)
fmt.Printf("Index terms: %d\n", stats.IndexTerms)
fmt.Printf("Vectors: %d\n", stats.VectorCount)
Embedding Status

Check whether embeddings have been generated for a document:

// Quick check if any embeddings exist
hasEmb, err := store.HasEmbeddings(docID)
if hasEmb {
    fmt.Println("Document has embeddings")
}

// Get detailed embedding status
status, err := store.GetEmbeddingStatus(docID)
fmt.Printf("Progress: %.1f%% (%d/%d blocks)\n",
    status.Progress(), status.EmbeddedCount, status.TotalEmbeddable)

if status.IsComplete {
    fmt.Println("Fully embedded")
} else if status.HasEmbeddings {
    fmt.Println("Partially embedded")
} else {
    fmt.Println("No embeddings")
}

// EmbeddingStatus fields:
// - HasEmbeddings   bool      - true if any embeddings exist
// - IsComplete      bool      - true if all embeddable blocks have vectors
// - EmbeddedCount   int       - number of blocks with embeddings
// - TotalEmbeddable int       - number of blocks that can be embedded
// - Model           string    - embedding model used
// - Dimension       int       - vector dimension
// - LastUpdated     time.Time - when embeddings were last updated
Database Info

Get information about the database schema and library version (useful for debugging):

info, err := store.DatabaseInfo()
fmt.Printf("Schema version: %d\n", info.SchemaVersion)
fmt.Printf("Library version: %s\n", info.LibraryVersion)
fmt.Printf("Created: %s\n", info.CreatedAt)
fmt.Printf("Last migration: %s\n", info.LastMigration)

// DatabaseInfo fields:
// - SchemaVersion  int       - current database schema version
// - LibraryVersion string    - library version that created/migrated the DB
// - CreatedAt      time.Time - when database was first created
// - LastMigration  time.Time - when last schema migration was applied
Bulk Import (Large Datasets)

For importing more than 1000 records, use batch mode with deferred embedding:

// Configure store for bulk import
store, _ := docuindex.NewStore("./data",
    docuindex.WithHNSWConfig(docuindex.HNSWConfig{
        EfConst: 64, // Faster construction for bulk import
    }),
)

// Prepare data
var allData []*docuindex.CustomData
for _, record := range records {
    allData = append(allData, &docuindex.CustomData{
        Source:     "my-source",
        ExternalID: record.ID,
        Name:       record.Title,
        Entries:    []docuindex.DataEntry{{Content: record.Content}},
    })
}

// Batch index with deferred embedding
docs, err := store.IndexCustomDataBatch(allData,
    docuindex.WithDeferEmbedding(true),
)

// Generate embeddings after all documents indexed
err = store.EmbedPendingDocuments()
Incremental Sync

For ongoing synchronization from external sources:

// Get last sync timestamp
lastImport, _ := store.GetLastImportTime("my-source")

// Fetch only changed records
changedRecords := fetchRecordsModifiedSince(lastImport)

// Index in batches with deferred embedding
const batchSize = 100
for i := 0; i < len(changedRecords); i += batchSize {
    batch := changedRecords[i:min(i+batchSize, len(changedRecords))]

    for _, record := range batch {
        store.UpsertCustomData(&docuindex.CustomData{
            Source:     "my-source",
            ExternalID: record.ID,
            Name:       record.Title,
            ImportedAt: time.Now(),
            Entries:    []docuindex.DataEntry{{Content: record.Content}},
        },
            docuindex.WithDeferEmbedding(true),
        )
    }
}

// Process embeddings (resumable if interrupted)
err = store.EmbedPendingDocuments()
Resumable Embedding Maintenance

For background processing that survives restarts:

// Find documents without embeddings (scheduled task / startup)
pending, _ := store.GetDocumentsWithoutEmbeddings()
if len(pending) > 0 {
    log.Printf("Processing %d pending documents", len(pending))
    err := store.EmbedPendingDocuments()
}
Background Embedding

For non-blocking embedding that allows your application to remain responsive:

// Start background embedding (returns immediately)
err := store.EmbedPendingDocumentsAsync()
if err != nil {
    log.Fatal(err)
}

// Check progress periodically
for store.IsBackgroundRunning() {
    status := store.GetBackgroundStatus()
    fmt.Printf("Progress: %.1f%% (%d/%d documents)\n",
        status.Progress(),
        status.DocumentsDone,
        status.DocumentsTotal)
    time.Sleep(time.Second)
}

// Or block until completion
if err := store.WaitForBackground(); err != nil {
    log.Printf("Background embedding failed: %v", err)
}

// Cancel if needed (e.g., on application shutdown)
store.CancelBackground()

The BackgroundEmbeddingStatus provides detailed progress:

  • Running - whether embedding is in progress
  • DocumentsTotal / DocumentsDone - progress counters
  • CurrentDocID / CurrentDocName - currently processing document
  • ElapsedTime - time since start
  • Error - error if failed
  • Progress() - completion percentage (0-100)
HNSW Tuning by Use Case
Use Case EfConst EfSearch Notes
Small dataset (<10k) 200 50 Default - best quality
Bulk import 64 100 Faster construction
High-recall search 200 200 Slower but more accurate
Real-time indexing 100 50 Balanced
// Store-wide configuration (applied to all searches)
store, _ := docuindex.NewStore("./data",
    docuindex.WithHNSWConfig(docuindex.HNSWConfig{
        M:        16,   // Max connections (default)
        EfConst:  64,   // For bulk import
        EfSearch: 100,  // Good search quality
    }),
)

// Per-query override for recall vs latency tradeoff
results, _ := store.Search("important query",
    docuindex.WithEfSearch(200),  // Higher recall for this query
)
// Recommended: 50=fast, 100=balanced, 200+=high recall

See OPTIMISATIONS.md for detailed performance tuning guide.

Storage Architecture

DocuIndex uses a unified SQLite database for all metadata and search indices:

data/
├── docuindex.db           # SQLite database (all metadata)
├── hnsw.idx               # HNSW vector index (binary)
└── images/                # Extracted images with UUID names
    ├── a1b2c3d4-e5f6-7890-abcd-ef1234567890.png
    └── ...
Database Schema

The SQLite database contains:

  • documents - Document metadata (name, path, format, page count, timestamps)
  • content_blocks - Parsed content with position, font, and semantic info
  • search_terms - BM25 inverted index with term positions
  • document_stats - Statistics for BM25 ranking
  • vectors - Block embeddings as BLOBs
  • images - Image metadata with AI-friendly description (actual files in images/ folder)

Search Capabilities

  • Industry-standard relevance ranking
  • Boolean queries (AND, OR, NOT)
  • Phrase matching with position data
  • Porter stemming and stop word filtering
  • Heading boost (1.5x)
  • HNSW approximate nearest neighbor
  • Supports Azure OpenAI, OpenAI, Ollama
  • Block-level embeddings for granular retrieval
  • Cosine similarity distance
  • Combines BM25 + vector results
  • Reciprocal Rank Fusion (RRF) scoring
  • Configurable weights

Supported PDF Features

  • PDF 1.0 - 1.7
  • Traditional and cross-reference stream xref tables
  • Stream filters: FlateDecode, ASCIIHexDecode, ASCII85Decode, LZWDecode, RunLengthDecode
  • Font types: Type1, TrueType, Type0 (CID), Type3
  • Encoding support: WinAnsi, MacRoman, Standard, PDFDocEncoding
  • ToUnicode CMap for proper character mapping
  • Content stream operators for text positioning and graphics state
  • Embedded images (DCTDecode/JPEG, PNG)

Supported DOCX Features

  • Full ZIP archive parsing via standard library
  • XML content parsing with namespace handling
  • Style-based and font-based heading detection
  • Style inheritance chain resolution
  • Bullet and numbered list extraction
  • Table content with row/column structure
  • Inline and anchored image extraction (JPEG, PNG, GIF, BMP, TIFF)
  • Dublin Core metadata (title, author, keywords)
  • Application properties (page count, word count)
  • Field instructions (TOC, page numbers, hyperlinks)
  • Position estimation for search result context

Dependencies

  • modernc.org/sqlite - Pure Go SQLite (no CGO)
  • github.com/google/uuid - UUID generation
  • Standard library for everything else

Limitations

  • Encrypted PDFs are not supported
  • JBIG2Decode and CCITTFaxDecode PDF filters have limited support
  • DOCX position estimation is approximate (DOCX lacks exact positioning unlike PDF)
  • DOCX vector images (EMF, WMF) are detected but skipped

License

MIT License

Documentation

Index

Constants

View Source
const Version = "0.0.14"

Version is the current library version (semver)

Variables

View Source
var (
	// PDF parsing errors
	ErrInvalidPDF         = errors.New("invalid PDF file")
	ErrCorruptedPDF       = errors.New("corrupted PDF structure")
	ErrUnsupportedVersion = errors.New("unsupported PDF version")
	ErrEncryptedPDF       = errors.New("encrypted PDF not supported")

	// DOCX parsing errors
	ErrInvalidDOCX    = errors.New("invalid DOCX file")
	ErrCorruptedDOCX  = errors.New("corrupted DOCX structure")
	ErrMissingContent = errors.New("missing document.xml in DOCX")

	// Feature errors
	ErrUnsupportedFeature  = errors.New("unsupported PDF feature")
	ErrUnsupportedEncoding = errors.New("unsupported text encoding")
	ErrUnsupportedFilter   = errors.New("unsupported stream filter")
	ErrUnsupportedFont     = errors.New("unsupported font type")
	ErrUnsupportedImage    = errors.New("unsupported image format")

	// Storage errors
	ErrDocumentNotFound = errors.New("document not found")
	ErrDocumentExists   = errors.New("document already exists")
	ErrStorageCorrupted = errors.New("storage corrupted")
	ErrStorageFull      = errors.New("storage full")

	// Search errors
	ErrSearchFailed   = errors.New("search failed")
	ErrInvalidQuery   = errors.New("invalid search query")
	ErrIndexCorrupted = errors.New("search index corrupted")

	// General errors
	ErrInvalidInput = errors.New("invalid input")
	ErrIOError      = errors.New("I/O error")

	// CustomData errors
	ErrInvalidCustomData = errors.New("invalid custom data")
	ErrMissingSource     = errors.New("custom data source is required")
	ErrMissingEntries    = errors.New("custom data must have at least one entry")
)

Sentinel errors for common cases

Functions

func ChunkBlocks

func ChunkBlocks(blocks []ContentBlock, opts ChunkOptions) [][]ContentBlock

ChunkBlocks regroups content blocks based on token limits

func ChunkSearchResults

func ChunkSearchResults(results []SearchResult, maxTokens int) [][]SearchResult

ChunkSearchResults chunks search results to fit within a token budget

func CombineChunkedBlocks

func CombineChunkedBlocks(blocks []ContentBlock, separator string) string

CombineChunkedBlocks combines a group of blocks into a single string

func EstimateAgentResultTokens

func EstimateAgentResultTokens(results []AgentSearchResult) int

EstimateAgentResultTokens estimates total tokens for agent search results

func EstimateBlockTokens

func EstimateBlockTokens(block *ContentBlock) int

EstimateBlockTokens estimates tokens for a content block

func EstimateContextTokens

func EstimateContextTokens(ctx *ContextResult) int

EstimateContextTokens estimates tokens for a context window

func EstimateResultTokens

func EstimateResultTokens(results []SearchResult) int

EstimateResultTokens estimates total tokens for search results

func EstimateTokens

func EstimateTokens(text string) int

EstimateTokens estimates the token count for a given text string. This uses an approximation based on cl100k_base tokenizer behavior. For English text, it averages ~4 characters per token.

func FitsInContext

func FitsInContext(text string, maxTokens int) bool

FitsInContext checks if content fits within a token budget

func IsCustomDataError

func IsCustomDataError(err error) bool

IsCustomDataError checks if an error is a custom data error

func IsDOCXError

func IsDOCXError(err error) bool

IsDOCXError checks if an error is a DOCX error

func IsObjectError

func IsObjectError(err error) bool

IsObjectError checks if an error is a PDF object error

func IsParseError

func IsParseError(err error) bool

IsParseError checks if an error is a PDF parsing error

func IsSearchError

func IsSearchError(err error) bool

IsSearchError checks if an error is a search error

func IsStorageError

func IsStorageError(err error) bool

IsStorageError checks if an error is a storage error

func QueryTypeDescription

func QueryTypeDescription(qt QueryType) string

QueryTypeDescription returns a human-readable description of the query type

func TruncateToTokenLimit

func TruncateToTokenLimit(text string, maxTokens int) string

TruncateToTokenLimit truncates text to approximately fit within a token limit

Types

type AgentSearchResponse

type AgentSearchResponse struct {
	Query           string              `json:"query"`
	QueryType       QueryType           `json:"query_type"`
	Results         []AgentSearchResult `json:"results"`
	TotalHits       int                 `json:"total_hits"`
	SearchTime      time.Duration       `json:"search_time"`
	EstimatedTokens int                 `json:"estimated_tokens"`
	Metadata        map[string]any      `json:"metadata,omitempty"`
}

AgentSearchResponse provides AI agent-friendly search results

type AgentSearchResult

type AgentSearchResult struct {
	DocumentID   string         `json:"document_id"`
	DocumentName string         `json:"document_name"`
	BlockID      string         `json:"block_id"`
	Content      string         `json:"content"`
	Snippet      string         `json:"snippet"`
	Score        float64        `json:"score"`
	Page         int            `json:"page"`
	Section      string         `json:"section"`
	CitationRef  string         `json:"citation_ref"` // e.g., "[1]", "[2]"
	TokenCount   int            `json:"token_count"`
	Context      []ContentBlock `json:"context,omitempty"`
	Images       []string       `json:"images,omitempty"`
}

AgentSearchResult is a single result optimized for AI agent consumption

type BackgroundEmbeddingStatus added in v0.0.9

type BackgroundEmbeddingStatus struct {
	Running        bool          `json:"running"`         // Is background build in progress
	StartedAt      time.Time     `json:"started_at"`      // When build started
	DocumentsTotal int           `json:"documents_total"` // Total documents to process
	DocumentsDone  int           `json:"documents_done"`  // Documents processed so far
	CurrentDocID   string        `json:"current_doc_id"`  // Currently processing document
	CurrentDocName string        `json:"current_doc_name"`
	ElapsedTime    time.Duration `json:"elapsed_time"`
	Error          error         `json:"error,omitempty"` // Error if failed
}

BackgroundEmbeddingStatus represents the status of background HNSW building

func (BackgroundEmbeddingStatus) Progress added in v0.0.9

func (s BackgroundEmbeddingStatus) Progress() float64

Progress returns the completion percentage (0-100)

type BlockType

type BlockType string

BlockType represents the type of content block

const (
	BlockTypeText    BlockType = "text"
	BlockTypeHeading BlockType = "heading"
	BlockTypeImage   BlockType = "image"
	BlockTypeList    BlockType = "list"
	BlockTypeTable   BlockType = "table"
	BlockTypeCustom  BlockType = "custom" // Custom data entry
)

type BoundingBox

type BoundingBox struct {
	X          float64 `json:"x"`           // Left edge in points
	Y          float64 `json:"y"`           // Bottom edge in points (PDF coordinate system)
	Width      float64 `json:"width"`       // Width in points
	Height     float64 `json:"height"`      // Height in points
	PageWidth  float64 `json:"page_width"`  // Page width for relative calculations
	PageHeight float64 `json:"page_height"` // Page height for relative calculations
}

BoundingBox represents the position and size of content on a page

func (BoundingBox) RelativePosition

func (b BoundingBox) RelativePosition() (xPct, yPct, wPct, hPct float64)

RelativePosition returns position as percentages of page dimensions

type Chunk

type Chunk struct {
	Content    string `json:"content"`
	StartIdx   int    `json:"start_idx"`
	EndIdx     int    `json:"end_idx"`
	TokenCount int    `json:"token_count"`
}

Chunk represents a portion of content with token information

func ChunkContent

func ChunkContent(content string, opts ChunkOptions) []Chunk

ChunkContent splits content into LLM-friendly chunks based on the provided options

type ChunkOptions

type ChunkOptions struct {
	MaxTokens     int    `json:"max_tokens"`     // Maximum tokens per chunk (e.g., 512, 1024)
	OverlapTokens int    `json:"overlap_tokens"` // Token overlap between chunks
	ChunkBy       string `json:"chunk_by"`       // "paragraph", "sentence", "tokens"
}

ChunkOptions configures how content is chunked for LLM context windows

func DefaultChunkOptions

func DefaultChunkOptions() ChunkOptions

DefaultChunkOptions returns sensible defaults for chunking

type ContentBlock

type ContentBlock struct {
	ID       string       `json:"id"`                 // Unique block ID (e.g., "blk_001")
	Type     BlockType    `json:"type"`               // text, heading, image, etc.
	Content  string       `json:"content"`            // Text content or image path
	Page     int          `json:"page"`               // 1-indexed page number
	BBox     BoundingBox  `json:"bbox"`               // Position on page
	Font     *FontInfo    `json:"font,omitempty"`     // Font info for text
	Semantic SemanticInfo `json:"semantic,omitempty"` // AI-friendly metadata
	Children []string     `json:"children,omitempty"` // Child block IDs for hierarchy
}

ContentBlock represents a unit of content with position and metadata

type ContextResult

type ContextResult struct {
	DocumentID string         `json:"document_id"`
	CenterID   string         `json:"center_id"` // The block we're getting context for
	Before     []ContentBlock `json:"before"`    // Blocks before
	Center     ContentBlock   `json:"center"`    // The center block
	After      []ContentBlock `json:"after"`     // Blocks after
}

ContextResult contains content blocks around a specific block

type CustomData

type CustomData struct {
	Source      string            `json:"source"` // Source identifier (e.g., "crm", "faq")
	Name        string            `json:"name"`   // Display name
	Description string            `json:"description,omitempty"`
	Tags        map[string]string `json:"tags,omitempty"`        // Filter-only tags (not searched)
	Entries     []DataEntry       `json:"entries"`               // Data entries to index
	ImportedAt  time.Time         `json:"imported_at,omitempty"` // When data was imported (for incremental updates)
	ExternalID  string            `json:"external_id,omitempty"` // Unique ID from source system (for upsert)
	Images      []CustomImage     `json:"images,omitempty"`      // Document-level images (not tied to specific entry)
}

CustomData represents structured data to be indexed

type CustomDataError

type CustomDataError struct {
	Source  string // Source identifier
	Message string // What went wrong
	Err     error  // Underlying error
}

CustomDataError indicates a custom data processing error

func NewCustomDataError

func NewCustomDataError(source, message string, err error) *CustomDataError

NewCustomDataError creates a new CustomDataError

func (*CustomDataError) Error

func (e *CustomDataError) Error() string

func (*CustomDataError) Unwrap

func (e *CustomDataError) Unwrap() error

type CustomImage added in v0.0.5

type CustomImage struct {
	Data         []byte `json:"-"`                       // Image bytes (required, excluded from JSON)
	Format       string `json:"format"`                  // "png", "jpeg", "gif", "bmp" (required)
	Width        int    `json:"width,omitempty"`         // Optional, auto-detected if not provided
	Height       int    `json:"height,omitempty"`        // Optional, auto-detected if not provided
	OriginalName string `json:"original_name,omitempty"` // Optional display name
	Description  string `json:"description,omitempty"`   // AI-friendly alt text/description
}

CustomImage represents an image to be indexed with custom data

type DOCXError

type DOCXError struct {
	Part    string // Which part of the DOCX (e.g., "word/document.xml")
	Message string // What went wrong
	Err     error  // Underlying error
}

DOCXError indicates a DOCX parsing or processing error

func NewDOCXError

func NewDOCXError(part, message string, err error) *DOCXError

NewDOCXError creates a new DOCXError

func (*DOCXError) Error

func (e *DOCXError) Error() string

func (*DOCXError) Unwrap

func (e *DOCXError) Unwrap() error

type DataEntry

type DataEntry struct {
	ID       string            `json:"id,omitempty"`       // Optional, auto-generated if empty
	Content  string            `json:"content"`            // Text content to index/embed
	Type     string            `json:"type,omitempty"`     // "text" (default), "json", "code"
	Metadata map[string]string `json:"metadata,omitempty"` // Entry-specific metadata
	Images   []CustomImage     `json:"images,omitempty"`   // Images associated with this entry
}

DataEntry represents a single entry in custom data

type DatabaseInfo added in v0.0.5

type DatabaseInfo struct {
	SchemaVersion  int       `json:"schema_version"`  // Current schema version
	LibraryVersion string    `json:"library_version"` // Library version that created/migrated DB
	CreatedAt      time.Time `json:"created_at"`      // When database was created
	LastMigration  time.Time `json:"last_migration"`  // When last migration was applied
}

DatabaseInfo contains information about the database schema and version

type DateRange

type DateRange struct {
	Start time.Time `json:"start"`
	End   time.Time `json:"end"`
}

DateRange represents a time range for filtering

type DedupResult

type DedupResult struct {
	IsDuplicate  bool    `json:"is_duplicate"`
	ExistingID   string  `json:"existing_id,omitempty"`
	ExistingName string  `json:"existing_name,omitempty"`
	Similarity   float64 `json:"similarity"`
	Method       string  `json:"method"` // "checksum", "content_hash", "embedding"
}

DedupResult contains information about duplicate detection

type Document

type Document struct {
	Info    DocumentInfo    `json:"info"`
	Content DocumentContent `json:"content"`
}

Document represents a fully indexed document

func (*Document) GetBlockByID

func (d *Document) GetBlockByID(id string) *ContentBlock

GetBlockByID finds a block by its ID

func (*Document) GetBlocksByPage

func (d *Document) GetBlocksByPage(page int) []ContentBlock

GetBlocksByPage returns blocks for a specific page

func (*Document) GetImageBlocks

func (d *Document) GetImageBlocks() []ContentBlock

GetImageBlocks returns only image-type blocks

func (*Document) GetTextBlocks

func (d *Document) GetTextBlocks() []ContentBlock

GetTextBlocks returns only text-type blocks

type DocumentContent

type DocumentContent struct {
	Version string         `json:"version"` // Schema version
	Blocks  []ContentBlock `json:"blocks"`  // All content blocks
}

DocumentContent holds the structured content of a document

type DocumentFormat

type DocumentFormat string

DocumentFormat represents the source document format

const (
	FormatPDF        DocumentFormat = "pdf"
	FormatDOCX       DocumentFormat = "docx"
	FormatCustomData DocumentFormat = "customdata" // Custom data source
)

type DocumentInfo

type DocumentInfo struct {
	ID           string         `json:"id"`                    // UUID
	Name         string         `json:"name"`                  // Original filename
	OriginalPath string         `json:"original_path"`         // Path when indexed
	SizeBytes    int64          `json:"size_bytes"`            // File size
	PageCount    int            `json:"page_count"`            // Number of pages
	Format       DocumentFormat `json:"format"`                // pdf, docx, customdata
	Checksum     string         `json:"checksum"`              // SHA-256 hash
	CreatedAt    time.Time      `json:"created_at"`            // When indexed
	UpdatedAt    time.Time      `json:"updated_at"`            // Last update
	Source       string         `json:"source,omitempty"`      // CustomData source identifier
	Description  string         `json:"description,omitempty"` // CustomData description
	ImportedAt   time.Time      `json:"imported_at,omitempty"` // CustomData import timestamp
	ExternalID   string         `json:"external_id,omitempty"` // External identifier for upsert
}

DocumentInfo contains metadata about an indexed document

type EmbeddingStatus

type EmbeddingStatus struct {
	HasEmbeddings   bool      `json:"has_embeddings"`         // True if any embeddings exist
	IsComplete      bool      `json:"is_complete"`            // True if all embeddable blocks have vectors
	EmbeddedCount   int       `json:"embedded_count"`         // Number of blocks with embeddings
	TotalEmbeddable int       `json:"total_embeddable"`       // Number of blocks that can be embedded
	Model           string    `json:"model,omitempty"`        // Embedding model used
	Dimension       int       `json:"dimension,omitempty"`    // Vector dimension
	LastUpdated     time.Time `json:"last_updated,omitempty"` // When embeddings were last updated
}

EmbeddingStatus contains information about a document's embedding state

func (*EmbeddingStatus) Progress

func (e *EmbeddingStatus) Progress() float64

Progress returns embedding completion as a percentage (0-100)

type Filter

type Filter struct {
	// contains filtered or unexported fields
}

Filter provides a fluent API for building search filters

func NewFilter

func NewFilter() *Filter

NewFilter creates a new empty filter

func (*Filter) After

func (f *Filter) After(t time.Time) *Filter

After filters documents created/imported after the given time

func (*Filter) Before

func (f *Filter) Before(t time.Time) *Filter

Before filters documents created/imported before the given time

func (*Filter) Build

func (f *Filter) Build() *FilterConfig

Build converts the Filter to FilterConfig for internal use

func (*Filter) DateRange

func (f *Filter) DateRange(start, end time.Time) *Filter

DateRange filters documents created/imported within a time range

func (*Filter) ExternalIDs

func (f *Filter) ExternalIDs(ids ...string) *Filter

ExternalIDs filters by external identifiers

func (*Filter) Formats

func (f *Filter) Formats(formats ...string) *Filter

Formats filters by document format (e.g., "pdf", "docx", "customdata")

func (*Filter) GetDateRange

func (f *Filter) GetDateRange() *DateRange

GetDateRange returns the date range filter

func (*Filter) GetExternalIDs

func (f *Filter) GetExternalIDs() []string

GetExternalIDs returns the external ID filters

func (*Filter) GetFormats

func (f *Filter) GetFormats() []string

GetFormats returns the format filters

func (*Filter) GetHasEmbeddings

func (f *Filter) GetHasEmbeddings() *bool

GetHasEmbeddings returns the embeddings filter

func (*Filter) GetMaxPageCount

func (f *Filter) GetMaxPageCount() int

GetMaxPageCount returns the maximum page count filter

func (*Filter) GetMinPageCount

func (f *Filter) GetMinPageCount() int

GetMinPageCount returns the minimum page count filter

func (*Filter) GetSources

func (f *Filter) GetSources() []string

GetSources returns the source filters

func (*Filter) GetTags

func (f *Filter) GetTags() map[string]string

GetTags returns the tag filters

func (*Filter) HasEmbeddings

func (f *Filter) HasEmbeddings(has bool) *Filter

HasEmbeddings filters documents that have (or don't have) embeddings

func (*Filter) IsEmpty

func (f *Filter) IsEmpty() bool

IsEmpty returns true if no filters are set

func (*Filter) MaxPages

func (f *Filter) MaxPages(n int) *Filter

MaxPages filters documents with at most n pages

func (*Filter) MinPages

func (f *Filter) MinPages(n int) *Filter

MinPages filters documents with at least n pages

func (*Filter) Sources

func (f *Filter) Sources(sources ...string) *Filter

Sources filters by source identifiers (e.g., "crm", "faq")

func (*Filter) Tag

func (f *Filter) Tag(key, value string) *Filter

Tag adds a single tag filter

func (*Filter) Tags

func (f *Filter) Tags(tags map[string]string) *Filter

Tags filters by multiple tags (AND logic - all must match)

type FilterConfig

type FilterConfig struct {
	Sources       []string
	Formats       []string
	Tags          map[string]string
	DateStart     time.Time
	DateEnd       time.Time
	MinPageCount  int
	MaxPageCount  int
	HasEmbeddings *bool
	ExternalIDs   []string
}

FilterConfig is the internal representation used by search

type FontError

type FontError struct {
	FontName string // Font name
	Message  string // What went wrong
	Err      error  // Underlying error
}

FontError indicates an error processing a font

func NewFontError

func NewFontError(fontName, message string, err error) *FontError

NewFontError creates a new FontError

func (*FontError) Error

func (e *FontError) Error() string

func (*FontError) Unwrap

func (e *FontError) Unwrap() error

type FontInfo

type FontInfo struct {
	Name   string  `json:"name"`             // Font name (e.g., "Helvetica-Bold")
	Size   float64 `json:"size"`             // Font size in points
	Bold   bool    `json:"bold,omitempty"`   // Is bold
	Italic bool    `json:"italic,omitempty"` // Is italic
}

FontInfo contains font metadata for text content

type HNSWConfig added in v0.0.6

type HNSWConfig struct {
	M        int // Max connections per layer (default: 16, range: 4-64)
	EfConst  int // Construction ef parameter (default: 200, range: 10-500)
	EfSearch int // Search ef parameter (default: 50, range: 10-500)
}

HNSWConfig configures the HNSW vector index parameters

type ImageInfo

type ImageInfo struct {
	ID           string `json:"id"`                      // Image UUID
	DocumentID   string `json:"document_id,omitempty"`   // Parent document ID
	BlockID      string `json:"block_id,omitempty"`      // Associated content block ID
	Format       string `json:"format"`                  // png, jpeg, etc.
	Width        int    `json:"width"`                   // Image width in pixels
	Height       int    `json:"height"`                  // Image height in pixels
	Page         int    `json:"page"`                    // Page number
	OriginalName string `json:"original_name,omitempty"` // Original image name from PDF/DOCX
}

ImageInfo contains metadata about an extracted image

type IndexOption

type IndexOption func(*indexConfig)

IndexOption configures indexing behavior

func WithDeferEmbedding added in v0.0.6

func WithDeferEmbedding(defer_ bool) IndexOption

WithDeferEmbedding skips embedding generation during indexing. Use store.EmbedPendingDocuments() or store.EmbedDocuments() to generate embeddings later in a batch operation. This is recommended for bulk imports. See OPTIMISATIONS.md for recommended patterns.

func WithForceReindex

func WithForceReindex(force bool) IndexOption

WithForceReindex forces re-indexing even if document exists

func WithIndexSource added in v0.0.13

func WithIndexSource(source string) IndexOption

WithIndexSource sets a custom source identifier for the document. This overrides the default format-based source ("pdf" or "docx"). Use this for logical categorization like "knowledgebase", "manual", etc.

func WithIndexTags added in v0.0.13

func WithIndexTags(tags map[string]string) IndexOption

WithIndexTags sets metadata tags for the document. Tags enable filtering in search queries via WithTags() search option.

func WithName

func WithName(name string) IndexOption

WithName overrides the document name

func WithProgressCallback

func WithProgressCallback(fn ProgressCallback) IndexOption

WithProgressCallback sets a callback for progress updates during indexing

func WithSourcePath

func WithSourcePath(path string) IndexOption

WithSourcePath sets the original source path for metadata

type IndexProgress

type IndexProgress struct {
	DocumentID      string        `json:"document_id"`
	DocumentName    string        `json:"document_name"`
	Status          string        `json:"status"` // "parsing", "extracting", "indexing", "embedding", "complete", "error"
	TotalPages      int           `json:"total_pages"`
	ProcessedPages  int           `json:"processed_pages"`
	TotalBlocks     int           `json:"total_blocks"`
	ProcessedBlocks int           `json:"processed_blocks"`
	Error           error         `json:"error,omitempty"`
	StartTime       time.Time     `json:"start_time"`
	ElapsedTime     time.Duration `json:"elapsed_time"`
}

IndexProgress reports progress during document indexing

type ObjectError

type ObjectError struct {
	ObjectNum int    // Object number
	GenNum    int    // Generation number
	Message   string // What went wrong
	Err       error  // Underlying error
}

ObjectError indicates an error with a specific PDF object

func NewObjectError

func NewObjectError(objNum, genNum int, message string, err error) *ObjectError

NewObjectError creates a new ObjectError

func (*ObjectError) Error

func (e *ObjectError) Error() string

func (*ObjectError) Unwrap

func (e *ObjectError) Unwrap() error

type Page

type Page struct {
	Number int            `json:"number"`
	Width  float64        `json:"width"`
	Height float64        `json:"height"`
	Blocks []ContentBlock `json:"blocks"`
}

Page represents a single page with its content

type PageError

type PageError struct {
	PageNum int    // 1-indexed page number
	Message string // What went wrong
	Err     error  // Underlying error
}

PageError indicates an error processing a specific page

func NewPageError

func NewPageError(pageNum int, message string, err error) *PageError

NewPageError creates a new PageError

func (*PageError) Error

func (e *PageError) Error() string

func (*PageError) Unwrap

func (e *PageError) Unwrap() error

type ParseError

type ParseError struct {
	Op      string // Operation that failed (e.g., "lexer.readToken")
	Offset  int64  // Byte offset in file where error occurred
	Message string // Human-readable message
	Err     error  // Underlying error
}

ParseError provides detailed information about PDF parsing failures

func NewParseError

func NewParseError(op string, offset int64, message string, err error) *ParseError

NewParseError creates a new ParseError

func (*ParseError) Error

func (e *ParseError) Error() string

func (*ParseError) Unwrap

func (e *ParseError) Unwrap() error

type Posting

type Posting struct {
	DocumentID string  `json:"doc_id"`
	BlockID    string  `json:"block_id"`
	Positions  []int   `json:"positions"` // Positions within the block text
	TF         float64 `json:"tf"`        // Term frequency for this posting
}

Posting represents a term occurrence in the index

type ProgressCallback

type ProgressCallback func(IndexProgress)

ProgressCallback is called during document indexing to report progress

type QueryType

type QueryType string

QueryType represents the detected intent of a search query

const (
	// QueryTypeFactual for questions like "What is X?", "Who is Y?"
	QueryTypeFactual QueryType = "factual"
	// QueryTypeNavigation for "Show me section...", "Find..."
	QueryTypeNavigation QueryType = "navigation"
	// QueryTypeSummary for "Summarize...", "Overview of..."
	QueryTypeSummary QueryType = "summary"
	// QueryTypeComparison for "Compare X and Y", "Difference between..."
	QueryTypeComparison QueryType = "comparison"
	// QueryTypeDefinition for "Define X", "What is the definition of..."
	QueryTypeDefinition QueryType = "definition"
	// QueryTypeList for "List all X", "Enumerate...", "What are all..."
	QueryTypeList QueryType = "list"
	// QueryTypeUnknown when intent cannot be determined
	QueryTypeUnknown QueryType = "unknown"
)

func DetectQueryType

func DetectQueryType(query string) QueryType

DetectQueryType analyzes a search query and returns its detected intent

type SearchDiagnostics added in v0.0.14

type SearchDiagnostics struct {
	KeywordResults  int           `json:"keyword_results"`   // Results from BM25 keyword search
	VectorResults   int           `json:"vector_results"`    // Results from vector/semantic search
	KeywordTime     time.Duration `json:"keyword_time"`      // Time spent on keyword search
	VectorTime      time.Duration `json:"vector_time"`       // Time spent on vector search
	FusionTime      time.Duration `json:"fusion_time"`       // Time spent fusing results
	FilteredByScore int           `json:"filtered_by_score"` // Results filtered by MinScore
	DiversifiedFrom int           `json:"diversified_from"`  // Results before diversification (0 if not applied)
}

SearchDiagnostics provides detailed information about how search was executed. Useful for debugging and optimizing search performance.

type SearchError

type SearchError struct {
	Query   string // The search query
	Message string // What went wrong
	Err     error  // Underlying error
}

SearchError indicates a search operation failure

func NewSearchError

func NewSearchError(query, message string, err error) *SearchError

NewSearchError creates a new SearchError

func (*SearchError) Error

func (e *SearchError) Error() string

func (*SearchError) Unwrap

func (e *SearchError) Unwrap() error

type SearchMode

type SearchMode string

SearchMode defines the type of search

const (
	// SearchModeKeyword uses BM25 keyword search only
	SearchModeKeyword SearchMode = "keyword"
	// SearchModeSemantic uses vector similarity search only
	SearchModeSemantic SearchMode = "semantic"
	// SearchModeHybrid combines BM25 and vector search with RRF fusion
	SearchModeHybrid SearchMode = "hybrid"
)

func SuggestedSearchMode

func SuggestedSearchMode(qt QueryType) SearchMode

SuggestedSearchMode returns the recommended search mode for a query type

type SearchOption

type SearchOption func(*searchConfig)

SearchOption configures search behavior

func WithAgentOutput

func WithAgentOutput(enabled bool) SearchOption

WithAgentOutput enables agent-friendly output format

func WithChunking

func WithChunking(opts ChunkOptions) SearchOption

WithChunking configures result chunking for LLM context windows

func WithCitations

func WithCitations(enabled bool) SearchOption

WithCitations adds citation references [1], [2], etc. to results

func WithContextWindow

func WithContextWindow(blocks int) SearchOption

WithContextWindow sets the number of surrounding blocks to include

func WithDiagnostics added in v0.0.14

func WithDiagnostics(enabled bool) SearchOption

WithDiagnostics enables detailed search diagnostics in the results. Diagnostics include timing breakdowns, result counts per search type, and filtering stats. Useful for debugging and optimizing search performance.

func WithDiversify added in v0.0.14

func WithDiversify(maxPerDoc int) SearchOption

WithDiversify limits the number of results per document to improve variety. This is useful when multiple blocks from the same document match the query. Set maxPerDoc to 0 (default) for unlimited results per document.

func WithDocuments

func WithDocuments(docIDs ...string) SearchOption

WithDocuments limits search to specific documents

func WithEfSearch added in v0.0.11

func WithEfSearch(ef int) SearchOption

WithEfSearch overrides the HNSW efSearch parameter for this query. Higher values improve recall at the cost of latency. If ef <= 0 (default), the store's configured EfSearch value is used. Recommended: 50 for speed, 100 for balanced, 200+ for high recall.

func WithEstimateTokens

func WithEstimateTokens(enabled bool) SearchOption

WithEstimateTokens includes token count estimates in results

func WithFilter

func WithFilter(f *Filter) SearchOption

WithFilter applies an advanced filter DSL

func WithHighlight

func WithHighlight(pre, post string) SearchOption

WithHighlight sets the highlight markers for matched terms

func WithImages

func WithImages(include bool) SearchOption

WithImages includes image blocks in search results

func WithKeywordWeight

func WithKeywordWeight(weight float64) SearchOption

WithKeywordWeight sets the weight for keyword search in hybrid mode (0-1)

func WithMaxResults

func WithMaxResults(n int) SearchOption

WithMaxResults sets the maximum number of results

func WithMetadata added in v0.0.12

func WithMetadata(include bool) SearchOption

WithMetadata includes document metadata (tags, source, external ID) in search results. This adds extra database queries per result, so it's disabled by default for performance.

func WithMinScore

func WithMinScore(score float64) SearchOption

WithMinScore sets the minimum relevance score threshold

func WithPageRange

func WithPageRange(start, end int) SearchOption

WithPageRange limits search to a specific page range

func WithSearchMode

func WithSearchMode(mode SearchMode) SearchOption

WithSearchMode sets the search mode (keyword, semantic, or hybrid)

func WithSections

func WithSections(sections ...string) SearchOption

WithSections limits search to specific sections

func WithSources

func WithSources(sources ...string) SearchOption

WithSources filters search results by source or format (e.g., "pdf", "docx", "crm")

func WithTags

func WithTags(tags map[string]string) SearchOption

WithTags filters search results by tags (AND logic - all must match)

func WithVectorWeight

func WithVectorWeight(weight float64) SearchOption

WithVectorWeight sets the weight for vector search in hybrid mode (0-1)

type SearchResult

type SearchResult struct {
	DocumentID   string            `json:"document_id"`
	DocumentName string            `json:"document_name"`
	BlockID      string            `json:"block_id"`
	Content      string            `json:"content"`               // Matched content
	Snippet      string            `json:"snippet"`               // Highlighted snippet
	Score        float64           `json:"score"`                 // Relevance score
	Page         int               `json:"page"`                  // Page number
	Section      string            `json:"section"`               // Section name
	Context      []ContentBlock    `json:"context"`               // Surrounding blocks for RAG
	Positions    []int             `json:"positions"`             // Match positions in content
	Images       []string          `json:"images,omitempty"`      // Image paths in same section
	Tags         map[string]string `json:"tags,omitempty"`        // Document tags
	Source       string            `json:"source,omitempty"`      // Source identifier (for CustomData)
	ExternalID   string            `json:"external_id,omitempty"` // External system ID
}

SearchResult represents a single search hit

type SearchResults

type SearchResults struct {
	Query       string             `json:"query"`
	TotalHits   int                `json:"total_hits"`
	Results     []SearchResult     `json:"results"`
	SearchTime  time.Duration      `json:"search_time"`
	Diagnostics *SearchDiagnostics `json:"diagnostics,omitempty"`
}

SearchResults contains search results with metadata

type SemanticInfo

type SemanticInfo struct {
	IsHeading    bool     `json:"is_heading,omitempty"`
	HeadingLevel int      `json:"heading_level,omitempty"` // 1-6 like HTML
	Section      string   `json:"section,omitempty"`       // Parent section title
	Keywords     []string `json:"keywords,omitempty"`      // Extracted keywords
	Context      string   `json:"context,omitempty"`       // Surrounding context summary
}

SemanticInfo contains AI-friendly metadata about content

type StorageError

type StorageError struct {
	Op      string // Operation (e.g., "write", "read", "delete")
	Path    string // File or directory path
	Message string // What went wrong
	Err     error  // Underlying error
}

StorageError indicates a storage operation failure

func NewStorageError

func NewStorageError(op, path, message string, err error) *StorageError

NewStorageError creates a new StorageError

func (*StorageError) Error

func (e *StorageError) Error() string

func (*StorageError) Unwrap

func (e *StorageError) Unwrap() error

type Store

type Store struct {
	// contains filtered or unexported fields
}

Store manages document storage, indexing, and search with unified SQLite backend

func NewStore

func NewStore(basePath string, opts ...StoreOption) (*Store, error)

NewStore creates a new document store at the specified path using SQLite

func (*Store) CancelBackground added in v0.0.9

func (s *Store) CancelBackground()

CancelBackground cancels background embedding if running. The operation will stop after the current document completes.

func (*Store) CheckDuplicate

func (s *Store) CheckDuplicate(path string) (*DedupResult, error)

CheckDuplicate checks if a document at the given path is a duplicate of an existing document. It uses file checksum for comparison.

func (*Store) CheckDuplicateByContent

func (s *Store) CheckDuplicateByContent(data []byte) (*DedupResult, error)

CheckDuplicateByContent checks if content already exists in the store. It computes a content hash from the provided data and checks for matches.

func (*Store) CheckHealth added in v0.0.8

func (s *Store) CheckHealth() (*StoreHealth, error)

CheckHealth performs a comprehensive consistency check on the store. It checks for HNSW-SQLite synchronization, incomplete embeddings, and other issues.

func (*Store) Close

func (s *Store) Close() error

Close releases resources held by the store

func (*Store) DatabaseInfo added in v0.0.5

func (s *Store) DatabaseInfo() (*DatabaseInfo, error)

DatabaseInfo returns information about the database schema and version

func (*Store) DeleteDocument

func (s *Store) DeleteDocument(id string) error

DeleteDocument removes a document from the store

func (*Store) DetectQueryType

func (s *Store) DetectQueryType(query string) QueryType

DetectQueryType analyzes a query string and returns its detected intent type

func (*Store) EmbedDocuments added in v0.0.6

func (s *Store) EmbedDocuments(docIDs ...string) error

EmbedDocuments generates embeddings for specific documents by ID. This is useful for resumable batch processing. OPTIMIZED: Saves HNSW index only once at the end instead of after each document.

func (*Store) EmbedPendingDocuments added in v0.0.6

func (s *Store) EmbedPendingDocuments() error

EmbedPendingDocuments generates embeddings for all documents that don't have them yet. This is the main method for deferred embedding patterns. OPTIMIZED: Saves HNSW index only once at the end instead of after each document.

func (*Store) EmbedPendingDocumentsAsync added in v0.0.9

func (s *Store) EmbedPendingDocumentsAsync() error

EmbedPendingDocumentsAsync starts embedding in background. Returns immediately. Use GetBackgroundStatus() to check progress, IsBackgroundRunning() to check if still running, or WaitForBackground() to block.

func (*Store) FindByExternalID added in v0.0.11

func (s *Store) FindByExternalID(source, externalID string) (*Document, error)

FindByExternalID finds a document by source and external ID

func (*Store) GetBackgroundStatus added in v0.0.9

func (s *Store) GetBackgroundStatus() BackgroundEmbeddingStatus

GetBackgroundStatus returns the current status of background embedding. Safe to call even when no background operation is running.

func (*Store) GetContext

func (s *Store) GetContext(docID, blockID string, windowSize int) (*ContextResult, error)

GetContext retrieves content blocks around a specific block

func (*Store) GetDocument

func (s *Store) GetDocument(id string) (*Document, error)

GetDocument retrieves a document by ID

func (*Store) GetDocumentsWithIncompleteEmbeddings added in v0.0.8

func (s *Store) GetDocumentsWithIncompleteEmbeddings() ([]*DocumentInfo, error)

GetDocumentsWithIncompleteEmbeddings returns documents that have some but not all blocks embedded. This identifies documents where embedding was interrupted mid-way and need recovery.

func (*Store) GetDocumentsWithoutEmbeddings added in v0.0.6

func (s *Store) GetDocumentsWithoutEmbeddings() ([]*DocumentInfo, error)

GetDocumentsWithoutEmbeddings returns documents that have embeddable content but don't have any embeddings yet. Use this for resumable maintenance tasks.

func (*Store) GetEmbeddingStatus

func (s *Store) GetEmbeddingStatus(docID string) (*EmbeddingStatus, error)

GetEmbeddingStatus returns the embedding status for a document

func (*Store) GetImagesByDocumentFiltered

func (s *Store) GetImagesByDocumentFiltered(docID, section string, page int) ([]ImageInfo, error)

GetImagesByDocumentFiltered returns images for a document with optional section/page filters

func (*Store) GetLastImportTime

func (s *Store) GetLastImportTime(source string) (time.Time, error)

GetLastImportTime returns the most recent import timestamp for a given source. Returns zero time if no imports are found for the source.

func (*Store) HasEmbeddings

func (s *Store) HasEmbeddings(docID string) (bool, error)

HasEmbeddings is a convenience method that returns true if a document has any embeddings

func (*Store) IndexCustomData

func (s *Store) IndexCustomData(data *CustomData, opts ...IndexOption) (*Document, error)

IndexCustomData indexes custom structured data

func (*Store) IndexCustomDataBatch added in v0.0.6

func (s *Store) IndexCustomDataBatch(data []*CustomData, opts ...IndexOption) ([]*Document, error)

IndexCustomDataBatch indexes multiple custom data entries efficiently. This is optimized for bulk imports with deferred global stats updates.

func (*Store) IndexDocument

func (s *Store) IndexDocument(path string, opts ...IndexOption) (*Document, error)

IndexDocument indexes a document from a file path

func (*Store) IndexDocumentWithProgress

func (s *Store) IndexDocumentWithProgress(path string, callback ProgressCallback, opts ...IndexOption) (*Document, error)

IndexDocumentWithProgress indexes a document with progress callbacks

func (*Store) IndexReader

func (s *Store) IndexReader(r io.Reader, name string, opts ...IndexOption) (*Document, error)

IndexReader indexes a document from an io.Reader

func (*Store) IsBackgroundRunning added in v0.0.9

func (s *Store) IsBackgroundRunning() bool

IsBackgroundRunning returns true if background embedding is in progress.

func (*Store) ListDocuments

func (s *Store) ListDocuments() ([]*DocumentInfo, error)

ListDocuments returns all indexed documents

func (*Store) Repair added in v0.0.8

func (s *Store) Repair() error

Repair fixes detected inconsistencies in the store. It rebuilds the HNSW index from SQLite and resumes incomplete embeddings. Returns nil if no repairs were needed or all repairs succeeded.

func (*Store) ResumeAllIncompleteEmbeddings added in v0.0.8

func (s *Store) ResumeAllIncompleteEmbeddings() error

ResumeAllIncompleteEmbeddings resumes embedding for all documents with incomplete embeddings. This is useful for recovering from crashes or interruptions during batch embedding.

func (*Store) ResumeEmbedding added in v0.0.8

func (s *Store) ResumeEmbedding(docID string) error

ResumeEmbedding continues embedding for a document that was partially embedded. Only embeds blocks that don't already have vectors, making it safe to call on documents that were interrupted during embedding.

func (*Store) Search

func (s *Store) Search(query string, opts ...SearchOption) (*SearchResults, error)

Search performs a search across all documents

func (*Store) SearchForAgent

func (s *Store) SearchForAgent(query string, opts ...SearchOption) (*AgentSearchResponse, error)

SearchForAgent performs a search optimized for AI agent consumption. Returns structured output with token estimates, citation references, and chunked results.

func (*Store) SearchInDocument

func (s *Store) SearchInDocument(docID, query string, opts ...SearchOption) (*SearchResults, error)

SearchInDocument searches within a specific document

func (*Store) SetEmbeddingProvider

func (s *Store) SetEmbeddingProvider(provider embedding.Provider) error

SetEmbeddingProvider configures the embedding provider after store creation. It automatically detects and repairs inconsistencies between the HNSW index and SQLite vectors, rebuilding the index if necessary.

func (*Store) Stats

func (s *Store) Stats() StoreStats

Stats returns statistics about the store

func (*Store) UpsertCustomData

func (s *Store) UpsertCustomData(data *CustomData, opts ...IndexOption) (*Document, error)

UpsertCustomData updates an existing document or creates a new one based on source + external_id. If ExternalID is provided and a document with the same source + external_id exists, it will be updated. If ExternalID is empty or no matching document exists, a new document is created.

func (*Store) WaitForBackground added in v0.0.9

func (s *Store) WaitForBackground() error

WaitForBackground blocks until background embedding completes. Returns the error from background embedding if any, or nil if successful. Returns nil immediately if no background operation is running.

type StoreHealth added in v0.0.8

type StoreHealth struct {
	IsHealthy            bool     `json:"is_healthy"`            // True if all checks pass
	HNSWSize             int      `json:"hnsw_size"`             // Number of vectors in HNSW index
	SQLiteVectorCount    int      `json:"sqlite_vector_count"`   // Number of vectors in SQLite
	HNSWSynced           bool     `json:"hnsw_synced"`           // True if HNSW matches SQLite
	IncompleteEmbeddings []string `json:"incomplete_embeddings"` // Document IDs with partial embeddings
	PendingEmbeddings    []string `json:"pending_embeddings"`    // Document IDs without any embeddings
	DocumentCount        int      `json:"document_count"`        // Total number of documents
	BlockCount           int      `json:"block_count"`           // Total number of content blocks
}

StoreHealth contains consistency check results for diagnosing store issues

type StoreOption

type StoreOption func(*storeConfig)

StoreOption configures Store behavior

func WithCache

func WithCache(enabled bool, size int) StoreOption

WithCache enables/disables object caching

func WithChecksum

func WithChecksum(enabled bool) StoreOption

WithChecksum enables/disables document checksum computation

func WithDedupCheck

func WithDedupCheck(enabled bool) StoreOption

WithDedupCheck enables duplicate detection before indexing

func WithHNSWConfig added in v0.0.6

func WithHNSWConfig(cfg HNSWConfig) StoreOption

WithHNSWConfig configures the HNSW vector index parameters. Use this to tune performance vs quality trade-offs for bulk imports. See OPTIMISATIONS.md for recommended settings by dataset size.

func WithImageExtraction

func WithImageExtraction(enabled bool) StoreOption

WithImageExtraction enables/disables image extraction

func WithMaxConcurrency

func WithMaxConcurrency(n int) StoreOption

WithMaxConcurrency sets the maximum concurrent operations

func WithNGrams

func WithNGrams(enabled bool, size int) StoreOption

WithNGrams enables n-gram indexing for fuzzy search

func WithSemanticAnalysis

func WithSemanticAnalysis(enabled bool) StoreOption

WithSemanticAnalysis enables/disables semantic analysis

func WithStemming

func WithStemming(enabled bool) StoreOption

WithStemming enables/disables Porter stemming in search

func WithStopWords

func WithStopWords(enabled bool) StoreOption

WithStopWords enables/disables stop word filtering

type StoreStats

type StoreStats struct {
	DocumentCount int   `json:"document_count"`
	TotalBlocks   int   `json:"total_blocks"`
	TotalImages   int   `json:"total_images"`
	IndexTerms    int   `json:"index_terms"`
	VectorCount   int   `json:"vector_count"`
	StorageBytes  int64 `json:"storage_bytes"`
}

StoreStats contains statistics about the store

type StreamError

type StreamError struct {
	Filter  string // Filter name (e.g., "FlateDecode")
	Message string // What went wrong
	Err     error  // Underlying error
}

StreamError indicates an error decoding a stream

func NewStreamError

func NewStreamError(filter, message string, err error) *StreamError

NewStreamError creates a new StreamError

func (*StreamError) Error

func (e *StreamError) Error() string

func (*StreamError) Unwrap

func (e *StreamError) Unwrap() error

type TermEntry

type TermEntry struct {
	Term     string    `json:"term"`
	DF       int       `json:"df"`       // Document frequency
	Postings []Posting `json:"postings"` // All occurrences
}

TermEntry contains all postings for a term

type TokenBudget

type TokenBudget struct {
	MaxTokens  int
	UsedTokens int
}

TokenBudget helps track token usage across multiple operations

func NewTokenBudget

func NewTokenBudget(maxTokens int) *TokenBudget

NewTokenBudget creates a new token budget tracker

func (*TokenBudget) Add

func (b *TokenBudget) Add(tokens int) bool

Add adds tokens to the budget, returns true if within budget

func (*TokenBudget) AddText

func (b *TokenBudget) AddText(text string) bool

AddText estimates and adds tokens for text, returns true if within budget

func (*TokenBudget) IsExhausted

func (b *TokenBudget) IsExhausted() bool

IsExhausted returns true if budget is exhausted

func (*TokenBudget) Remaining

func (b *TokenBudget) Remaining() int

Remaining returns remaining token budget

func (*TokenBudget) Reset

func (b *TokenBudget) Reset()

Reset resets the budget to zero usage

func (*TokenBudget) Usage

func (b *TokenBudget) Usage() float64

Usage returns the current usage percentage (0-100)

Directories

Path Synopsis
cmd
test_pdf command
internal
nlp
Package nlp provides natural language processing utilities for text analysis.
pdf
Package search provides hybrid search functionality combining keyword and vector search.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL