Documentation
¶
Index ¶
- Constants
- Variables
- func ChunkBlocks(blocks []ContentBlock, opts ChunkOptions) [][]ContentBlock
- func ChunkSearchResults(results []SearchResult, maxTokens int) [][]SearchResult
- func CombineChunkedBlocks(blocks []ContentBlock, separator string) string
- func EstimateAgentResultTokens(results []AgentSearchResult) int
- func EstimateBlockTokens(block *ContentBlock) int
- func EstimateContextTokens(ctx *ContextResult) int
- func EstimateResultTokens(results []SearchResult) int
- func EstimateTokens(text string) int
- func FitsInContext(text string, maxTokens int) bool
- func IsCustomDataError(err error) bool
- func IsDOCXError(err error) bool
- func IsObjectError(err error) bool
- func IsParseError(err error) bool
- func IsSearchError(err error) bool
- func IsStorageError(err error) bool
- func QueryTypeDescription(qt QueryType) string
- func TruncateToTokenLimit(text string, maxTokens int) string
- type AgentSearchResponse
- type AgentSearchResult
- type BackgroundEmbeddingStatus
- type BlockType
- type BoundingBox
- type Chunk
- type ChunkOptions
- type ContentBlock
- type ContextResult
- type CustomData
- type CustomDataError
- type CustomImage
- type DOCXError
- type DataEntry
- type DatabaseInfo
- type DateRange
- type DedupResult
- type Document
- type DocumentContent
- type DocumentFormat
- type DocumentInfo
- type EmbeddingStatus
- type Filter
- func (f *Filter) After(t time.Time) *Filter
- func (f *Filter) Before(t time.Time) *Filter
- func (f *Filter) Build() *FilterConfig
- func (f *Filter) DateRange(start, end time.Time) *Filter
- func (f *Filter) ExternalIDs(ids ...string) *Filter
- func (f *Filter) Formats(formats ...string) *Filter
- func (f *Filter) GetDateRange() *DateRange
- func (f *Filter) GetExternalIDs() []string
- func (f *Filter) GetFormats() []string
- func (f *Filter) GetHasEmbeddings() *bool
- func (f *Filter) GetMaxPageCount() int
- func (f *Filter) GetMinPageCount() int
- func (f *Filter) GetSources() []string
- func (f *Filter) GetTags() map[string]string
- func (f *Filter) HasEmbeddings(has bool) *Filter
- func (f *Filter) IsEmpty() bool
- func (f *Filter) MaxPages(n int) *Filter
- func (f *Filter) MinPages(n int) *Filter
- func (f *Filter) Sources(sources ...string) *Filter
- func (f *Filter) Tag(key, value string) *Filter
- func (f *Filter) Tags(tags map[string]string) *Filter
- type FilterConfig
- type FontError
- type FontInfo
- type HNSWConfig
- type ImageInfo
- type IndexOption
- func WithDeferEmbedding(defer_ bool) IndexOption
- func WithForceReindex(force bool) IndexOption
- func WithIndexSource(source string) IndexOption
- func WithIndexTags(tags map[string]string) IndexOption
- func WithName(name string) IndexOption
- func WithProgressCallback(fn ProgressCallback) IndexOption
- func WithSourcePath(path string) IndexOption
- type IndexProgress
- type ObjectError
- type Page
- type PageError
- type ParseError
- type Posting
- type ProgressCallback
- type QueryType
- type SearchDiagnostics
- type SearchError
- type SearchMode
- type SearchOption
- func WithAgentOutput(enabled bool) SearchOption
- func WithChunking(opts ChunkOptions) SearchOption
- func WithCitations(enabled bool) SearchOption
- func WithContextWindow(blocks int) SearchOption
- func WithDiagnostics(enabled bool) SearchOption
- func WithDiversify(maxPerDoc int) SearchOption
- func WithDocuments(docIDs ...string) SearchOption
- func WithEfSearch(ef int) SearchOption
- func WithEstimateTokens(enabled bool) SearchOption
- func WithFilter(f *Filter) SearchOption
- func WithHighlight(pre, post string) SearchOption
- func WithImages(include bool) SearchOption
- func WithKeywordWeight(weight float64) SearchOption
- func WithMaxResults(n int) SearchOption
- func WithMetadata(include bool) SearchOption
- func WithMinScore(score float64) SearchOption
- func WithPageRange(start, end int) SearchOption
- func WithSearchMode(mode SearchMode) SearchOption
- func WithSections(sections ...string) SearchOption
- func WithSources(sources ...string) SearchOption
- func WithTags(tags map[string]string) SearchOption
- func WithVectorWeight(weight float64) SearchOption
- type SearchResult
- type SearchResults
- type SemanticInfo
- type StorageError
- type Store
- func (s *Store) CancelBackground()
- func (s *Store) CheckDuplicate(path string) (*DedupResult, error)
- func (s *Store) CheckDuplicateByContent(data []byte) (*DedupResult, error)
- func (s *Store) CheckHealth() (*StoreHealth, error)
- func (s *Store) Close() error
- func (s *Store) DatabaseInfo() (*DatabaseInfo, error)
- func (s *Store) DeleteDocument(id string) error
- func (s *Store) DetectQueryType(query string) QueryType
- func (s *Store) EmbedDocuments(docIDs ...string) error
- func (s *Store) EmbedPendingDocuments() error
- func (s *Store) EmbedPendingDocumentsAsync() error
- func (s *Store) FindByExternalID(source, externalID string) (*Document, error)
- func (s *Store) GetBackgroundStatus() BackgroundEmbeddingStatus
- func (s *Store) GetContext(docID, blockID string, windowSize int) (*ContextResult, error)
- func (s *Store) GetDocument(id string) (*Document, error)
- func (s *Store) GetDocumentsWithIncompleteEmbeddings() ([]*DocumentInfo, error)
- func (s *Store) GetDocumentsWithoutEmbeddings() ([]*DocumentInfo, error)
- func (s *Store) GetEmbeddingStatus(docID string) (*EmbeddingStatus, error)
- func (s *Store) GetImagesByDocumentFiltered(docID, section string, page int) ([]ImageInfo, error)
- func (s *Store) GetLastImportTime(source string) (time.Time, error)
- func (s *Store) HasEmbeddings(docID string) (bool, error)
- func (s *Store) IndexCustomData(data *CustomData, opts ...IndexOption) (*Document, error)
- func (s *Store) IndexCustomDataBatch(data []*CustomData, opts ...IndexOption) ([]*Document, error)
- func (s *Store) IndexDocument(path string, opts ...IndexOption) (*Document, error)
- func (s *Store) IndexDocumentWithProgress(path string, callback ProgressCallback, opts ...IndexOption) (*Document, error)
- func (s *Store) IndexReader(r io.Reader, name string, opts ...IndexOption) (*Document, error)
- func (s *Store) IsBackgroundRunning() bool
- func (s *Store) ListDocuments() ([]*DocumentInfo, error)
- func (s *Store) Repair() error
- func (s *Store) ResumeAllIncompleteEmbeddings() error
- func (s *Store) ResumeEmbedding(docID string) error
- func (s *Store) Search(query string, opts ...SearchOption) (*SearchResults, error)
- func (s *Store) SearchForAgent(query string, opts ...SearchOption) (*AgentSearchResponse, error)
- func (s *Store) SearchInDocument(docID, query string, opts ...SearchOption) (*SearchResults, error)
- func (s *Store) SetEmbeddingProvider(provider embedding.Provider) error
- func (s *Store) Stats() StoreStats
- func (s *Store) UpsertCustomData(data *CustomData, opts ...IndexOption) (*Document, error)
- func (s *Store) WaitForBackground() error
- type StoreHealth
- type StoreOption
- func WithCache(enabled bool, size int) StoreOption
- func WithChecksum(enabled bool) StoreOption
- func WithDedupCheck(enabled bool) StoreOption
- func WithHNSWConfig(cfg HNSWConfig) StoreOption
- func WithImageExtraction(enabled bool) StoreOption
- func WithMaxConcurrency(n int) StoreOption
- func WithNGrams(enabled bool, size int) StoreOption
- func WithSemanticAnalysis(enabled bool) StoreOption
- func WithStemming(enabled bool) StoreOption
- func WithStopWords(enabled bool) StoreOption
- type StoreStats
- type StreamError
- type TermEntry
- type TokenBudget
Constants ¶
const Version = "0.0.14"
Version is the current library version (semver)
Variables ¶
var (
	// PDF parsing errors
	ErrInvalidPDF         = errors.New("invalid PDF file")
	ErrCorruptedPDF       = errors.New("corrupted PDF structure")
	ErrUnsupportedVersion = errors.New("unsupported PDF version")
	ErrEncryptedPDF       = errors.New("encrypted PDF not supported")

	// DOCX parsing errors
	ErrInvalidDOCX    = errors.New("invalid DOCX file")
	ErrCorruptedDOCX  = errors.New("corrupted DOCX structure")
	ErrMissingContent = errors.New("missing document.xml in DOCX")

	// Feature errors
	ErrUnsupportedFeature  = errors.New("unsupported PDF feature")
	ErrUnsupportedEncoding = errors.New("unsupported text encoding")
	ErrUnsupportedFilter   = errors.New("unsupported stream filter")
	ErrUnsupportedFont     = errors.New("unsupported font type")
	ErrUnsupportedImage    = errors.New("unsupported image format")

	// Storage errors
	ErrDocumentNotFound = errors.New("document not found")
	ErrDocumentExists   = errors.New("document already exists")
	ErrStorageCorrupted = errors.New("storage corrupted")
	ErrStorageFull      = errors.New("storage full")

	// Search errors
	ErrSearchFailed   = errors.New("search failed")
	ErrInvalidQuery   = errors.New("invalid search query")
	ErrIndexCorrupted = errors.New("search index corrupted")

	// General errors
	ErrInvalidInput = errors.New("invalid input")
	ErrIOError      = errors.New("I/O error")

	// CustomData errors
	ErrInvalidCustomData = errors.New("invalid custom data")
	ErrMissingSource     = errors.New("custom data source is required")
	ErrMissingEntries    = errors.New("custom data must have at least one entry")
)
Sentinel errors for common cases
Functions ¶
func ChunkBlocks ¶
func ChunkBlocks(blocks []ContentBlock, opts ChunkOptions) [][]ContentBlock
ChunkBlocks regroups content blocks based on token limits
func ChunkSearchResults ¶
func ChunkSearchResults(results []SearchResult, maxTokens int) [][]SearchResult
ChunkSearchResults chunks search results to fit within a token budget
func CombineChunkedBlocks ¶
func CombineChunkedBlocks(blocks []ContentBlock, separator string) string
CombineChunkedBlocks combines a group of blocks into a single string
func EstimateAgentResultTokens ¶
func EstimateAgentResultTokens(results []AgentSearchResult) int
EstimateAgentResultTokens estimates total tokens for agent search results
func EstimateBlockTokens ¶
func EstimateBlockTokens(block *ContentBlock) int
EstimateBlockTokens estimates tokens for a content block
func EstimateContextTokens ¶
func EstimateContextTokens(ctx *ContextResult) int
EstimateContextTokens estimates tokens for a context window
func EstimateResultTokens ¶
func EstimateResultTokens(results []SearchResult) int
EstimateResultTokens estimates total tokens for search results
func EstimateTokens ¶
func EstimateTokens(text string) int
EstimateTokens estimates the token count for a given text string. This uses an approximation based on cl100k_base tokenizer behavior. For English text, it averages ~4 characters per token.
func FitsInContext ¶
func FitsInContext(text string, maxTokens int) bool
FitsInContext checks if content fits within a token budget
func IsCustomDataError ¶
func IsCustomDataError(err error) bool
IsCustomDataError checks if an error is a custom data error
func IsObjectError ¶
func IsObjectError(err error) bool
IsObjectError checks if an error is a PDF object error
func IsParseError ¶
func IsParseError(err error) bool
IsParseError checks if an error is a PDF parsing error
func IsSearchError ¶
func IsSearchError(err error) bool
IsSearchError checks if an error is a search error
func IsStorageError ¶
func IsStorageError(err error) bool
IsStorageError checks if an error is a storage error
func QueryTypeDescription ¶
func QueryTypeDescription(qt QueryType) string
QueryTypeDescription returns a human-readable description of the query type
func TruncateToTokenLimit ¶
func TruncateToTokenLimit(text string, maxTokens int) string
TruncateToTokenLimit truncates text to approximately fit within a token limit
Types ¶
type AgentSearchResponse ¶
type AgentSearchResponse struct {
Query string `json:"query"`
QueryType QueryType `json:"query_type"`
Results []AgentSearchResult `json:"results"`
TotalHits int `json:"total_hits"`
SearchTime time.Duration `json:"search_time"`
EstimatedTokens int `json:"estimated_tokens"`
Metadata map[string]any `json:"metadata,omitempty"`
}
AgentSearchResponse provides AI agent-friendly search results
type AgentSearchResult ¶
type AgentSearchResult struct {
DocumentID string `json:"document_id"`
DocumentName string `json:"document_name"`
BlockID string `json:"block_id"`
Content string `json:"content"`
Snippet string `json:"snippet"`
Score float64 `json:"score"`
Page int `json:"page"`
Section string `json:"section"`
CitationRef string `json:"citation_ref"` // e.g., "[1]", "[2]"
TokenCount int `json:"token_count"`
Context []ContentBlock `json:"context,omitempty"`
Images []string `json:"images,omitempty"`
}
AgentSearchResult is a single result optimized for AI agent consumption
type BackgroundEmbeddingStatus ¶ added in v0.0.9
type BackgroundEmbeddingStatus struct {
Running bool `json:"running"` // Is background build in progress
StartedAt time.Time `json:"started_at"` // When build started
DocumentsTotal int `json:"documents_total"` // Total documents to process
DocumentsDone int `json:"documents_done"` // Documents processed so far
CurrentDocID string `json:"current_doc_id"` // Currently processing document
CurrentDocName string `json:"current_doc_name"`
ElapsedTime time.Duration `json:"elapsed_time"`
Error error `json:"error,omitempty"` // Error if failed
}
BackgroundEmbeddingStatus represents the status of background HNSW building
func (BackgroundEmbeddingStatus) Progress ¶ added in v0.0.9
func (s BackgroundEmbeddingStatus) Progress() float64
Progress returns the completion percentage (0-100)
type BoundingBox ¶
type BoundingBox struct {
X float64 `json:"x"` // Left edge in points
Y float64 `json:"y"` // Bottom edge in points (PDF coordinate system)
Width float64 `json:"width"` // Width in points
Height float64 `json:"height"` // Height in points
PageWidth float64 `json:"page_width"` // Page width for relative calculations
PageHeight float64 `json:"page_height"` // Page height for relative calculations
}
BoundingBox represents the position and size of content on a page
func (BoundingBox) RelativePosition ¶
func (b BoundingBox) RelativePosition() (xPct, yPct, wPct, hPct float64)
RelativePosition returns position as percentages of page dimensions
type Chunk ¶
type Chunk struct {
Content string `json:"content"`
StartIdx int `json:"start_idx"`
EndIdx int `json:"end_idx"`
TokenCount int `json:"token_count"`
}
Chunk represents a portion of content with token information
func ChunkContent ¶
func ChunkContent(content string, opts ChunkOptions) []Chunk
ChunkContent splits content into LLM-friendly chunks based on the provided options
type ChunkOptions ¶
type ChunkOptions struct {
MaxTokens int `json:"max_tokens"` // Maximum tokens per chunk (e.g., 512, 1024)
OverlapTokens int `json:"overlap_tokens"` // Token overlap between chunks
ChunkBy string `json:"chunk_by"` // "paragraph", "sentence", "tokens"
}
ChunkOptions configures how content is chunked for LLM context windows
func DefaultChunkOptions ¶
func DefaultChunkOptions() ChunkOptions
DefaultChunkOptions returns sensible defaults for chunking
type ContentBlock ¶
type ContentBlock struct {
ID string `json:"id"` // Unique block ID (e.g., "blk_001")
Type BlockType `json:"type"` // text, heading, image, etc.
Content string `json:"content"` // Text content or image path
Page int `json:"page"` // 1-indexed page number
BBox BoundingBox `json:"bbox"` // Position on page
Font *FontInfo `json:"font,omitempty"` // Font info for text
Semantic SemanticInfo `json:"semantic,omitempty"` // AI-friendly metadata
Children []string `json:"children,omitempty"` // Child block IDs for hierarchy
}
ContentBlock represents a unit of content with position and metadata
type ContextResult ¶
type ContextResult struct {
DocumentID string `json:"document_id"`
CenterID string `json:"center_id"` // The block we're getting context for
Before []ContentBlock `json:"before"` // Blocks before
Center ContentBlock `json:"center"` // The center block
After []ContentBlock `json:"after"` // Blocks after
}
ContextResult contains content blocks around a specific block
type CustomData ¶
type CustomData struct {
Source string `json:"source"` // Source identifier (e.g., "crm", "faq")
Name string `json:"name"` // Display name
Description string `json:"description,omitempty"`
Tags map[string]string `json:"tags,omitempty"` // Filter-only tags (not searched)
Entries []DataEntry `json:"entries"` // Data entries to index
ImportedAt time.Time `json:"imported_at,omitempty"` // When data was imported (for incremental updates)
ExternalID string `json:"external_id,omitempty"` // Unique ID from source system (for upsert)
Images []CustomImage `json:"images,omitempty"` // Document-level images (not tied to specific entry)
}
CustomData represents structured data to be indexed
type CustomDataError ¶
type CustomDataError struct {
Source string // Source identifier
Message string // What went wrong
Err error // Underlying error
}
CustomDataError indicates a custom data processing error
func NewCustomDataError ¶
func NewCustomDataError(source, message string, err error) *CustomDataError
NewCustomDataError creates a new CustomDataError
func (*CustomDataError) Error ¶
func (e *CustomDataError) Error() string
func (*CustomDataError) Unwrap ¶
func (e *CustomDataError) Unwrap() error
type CustomImage ¶ added in v0.0.5
type CustomImage struct {
Data []byte `json:"-"` // Image bytes (required, excluded from JSON)
Format string `json:"format"` // "png", "jpeg", "gif", "bmp" (required)
Width int `json:"width,omitempty"` // Optional, auto-detected if not provided
Height int `json:"height,omitempty"` // Optional, auto-detected if not provided
OriginalName string `json:"original_name,omitempty"` // Optional display name
Description string `json:"description,omitempty"` // AI-friendly alt text/description
}
CustomImage represents an image to be indexed with custom data
type DOCXError ¶
type DOCXError struct {
Part string // Which part of the DOCX (e.g., "word/document.xml")
Message string // What went wrong
Err error // Underlying error
}
DOCXError indicates a DOCX parsing or processing error
func NewDOCXError ¶
NewDOCXError creates a new DOCXError
type DataEntry ¶
type DataEntry struct {
ID string `json:"id,omitempty"` // Optional, auto-generated if empty
Content string `json:"content"` // Text content to index/embed
Type string `json:"type,omitempty"` // "text" (default), "json", "code"
Metadata map[string]string `json:"metadata,omitempty"` // Entry-specific metadata
Images []CustomImage `json:"images,omitempty"` // Images associated with this entry
}
DataEntry represents a single entry in custom data
type DatabaseInfo ¶ added in v0.0.5
type DatabaseInfo struct {
SchemaVersion int `json:"schema_version"` // Current schema version
LibraryVersion string `json:"library_version"` // Library version that created/migrated DB
CreatedAt time.Time `json:"created_at"` // When database was created
LastMigration time.Time `json:"last_migration"` // When last migration was applied
}
DatabaseInfo contains information about the database schema and version
type DedupResult ¶
type DedupResult struct {
IsDuplicate bool `json:"is_duplicate"`
ExistingID string `json:"existing_id,omitempty"`
ExistingName string `json:"existing_name,omitempty"`
Similarity float64 `json:"similarity"`
Method string `json:"method"` // "checksum", "content_hash", "embedding"
}
DedupResult contains information about duplicate detection
type Document ¶
type Document struct {
Info DocumentInfo `json:"info"`
Content DocumentContent `json:"content"`
}
Document represents a fully indexed document
func (*Document) GetBlockByID ¶
func (d *Document) GetBlockByID(id string) *ContentBlock
GetBlockByID finds a block by its ID
func (*Document) GetBlocksByPage ¶
func (d *Document) GetBlocksByPage(page int) []ContentBlock
GetBlocksByPage returns blocks for a specific page
func (*Document) GetImageBlocks ¶
func (d *Document) GetImageBlocks() []ContentBlock
GetImageBlocks returns only image-type blocks
func (*Document) GetTextBlocks ¶
func (d *Document) GetTextBlocks() []ContentBlock
GetTextBlocks returns only text-type blocks
type DocumentContent ¶
type DocumentContent struct {
Version string `json:"version"` // Schema version
Blocks []ContentBlock `json:"blocks"` // All content blocks
}
DocumentContent holds the structured content of a document
type DocumentFormat ¶
type DocumentFormat string
DocumentFormat represents the source document format
const (
	FormatPDF        DocumentFormat = "pdf"
	FormatDOCX       DocumentFormat = "docx"
	FormatCustomData DocumentFormat = "customdata" // Custom data source
)
type DocumentInfo ¶
type DocumentInfo struct {
ID string `json:"id"` // UUID
Name string `json:"name"` // Original filename
OriginalPath string `json:"original_path"` // Path when indexed
SizeBytes int64 `json:"size_bytes"` // File size
PageCount int `json:"page_count"` // Number of pages
Format DocumentFormat `json:"format"` // pdf, docx, customdata
Checksum string `json:"checksum"` // SHA-256 hash
CreatedAt time.Time `json:"created_at"` // When indexed
UpdatedAt time.Time `json:"updated_at"` // Last update
Source string `json:"source,omitempty"` // CustomData source identifier
Description string `json:"description,omitempty"` // CustomData description
ImportedAt time.Time `json:"imported_at,omitempty"` // CustomData import timestamp
ExternalID string `json:"external_id,omitempty"` // External identifier for upsert
}
DocumentInfo contains metadata about an indexed document
type EmbeddingStatus ¶
type EmbeddingStatus struct {
HasEmbeddings bool `json:"has_embeddings"` // True if any embeddings exist
IsComplete bool `json:"is_complete"` // True if all embeddable blocks have vectors
EmbeddedCount int `json:"embedded_count"` // Number of blocks with embeddings
TotalEmbeddable int `json:"total_embeddable"` // Number of blocks that can be embedded
Model string `json:"model,omitempty"` // Embedding model used
Dimension int `json:"dimension,omitempty"` // Vector dimension
LastUpdated time.Time `json:"last_updated,omitempty"` // When embeddings were last updated
}
EmbeddingStatus contains information about a document's embedding state
func (*EmbeddingStatus) Progress ¶
func (e *EmbeddingStatus) Progress() float64
Progress returns embedding completion as a percentage (0-100)
type Filter ¶
type Filter struct {
// contains filtered or unexported fields
}
Filter provides a fluent API for building search filters
func (*Filter) Build ¶
func (f *Filter) Build() *FilterConfig
Build converts the Filter to FilterConfig for internal use
func (*Filter) ExternalIDs ¶
func (f *Filter) ExternalIDs(ids ...string) *Filter
ExternalIDs filters by external identifiers
func (*Filter) GetDateRange ¶
func (f *Filter) GetDateRange() *DateRange
GetDateRange returns the date range filter
func (*Filter) GetExternalIDs ¶
func (f *Filter) GetExternalIDs() []string
GetExternalIDs returns the external ID filters
func (*Filter) GetFormats ¶
func (f *Filter) GetFormats() []string
GetFormats returns the format filters
func (*Filter) GetHasEmbeddings ¶
func (f *Filter) GetHasEmbeddings() *bool
GetHasEmbeddings returns the embeddings filter
func (*Filter) GetMaxPageCount ¶
func (f *Filter) GetMaxPageCount() int
GetMaxPageCount returns the maximum page count filter
func (*Filter) GetMinPageCount ¶
func (f *Filter) GetMinPageCount() int
GetMinPageCount returns the minimum page count filter
func (*Filter) GetSources ¶
func (f *Filter) GetSources() []string
GetSources returns the source filters
func (*Filter) HasEmbeddings ¶
func (f *Filter) HasEmbeddings(has bool) *Filter
HasEmbeddings filters documents that have (or don't have) embeddings
type FilterConfig ¶
type FilterConfig struct {
Sources []string
Formats []string
Tags map[string]string
DateStart time.Time
DateEnd time.Time
MinPageCount int
MaxPageCount int
HasEmbeddings *bool
ExternalIDs []string
}
FilterConfig is the internal representation used by search
type FontError ¶
type FontError struct {
FontName string // Font name
Message string // What went wrong
Err error // Underlying error
}
FontError indicates an error processing a font
func NewFontError ¶
NewFontError creates a new FontError
type FontInfo ¶
type FontInfo struct {
Name string `json:"name"` // Font name (e.g., "Helvetica-Bold")
Size float64 `json:"size"` // Font size in points
Bold bool `json:"bold,omitempty"` // Is bold
Italic bool `json:"italic,omitempty"` // Is italic
}
FontInfo contains font metadata for text content
type HNSWConfig ¶ added in v0.0.6
type HNSWConfig struct {
M int // Max connections per layer (default: 16, range: 4-64)
EfConst int // Construction ef parameter (default: 200, range: 10-500)
EfSearch int // Search ef parameter (default: 50, range: 10-500)
}
HNSWConfig configures the HNSW vector index parameters
type ImageInfo ¶
type ImageInfo struct {
ID string `json:"id"` // Image UUID
DocumentID string `json:"document_id,omitempty"` // Parent document ID
BlockID string `json:"block_id,omitempty"` // Associated content block ID
Format string `json:"format"` // png, jpeg, etc.
Width int `json:"width"` // Image width in pixels
Height int `json:"height"` // Image height in pixels
Page int `json:"page"` // Page number
OriginalName string `json:"original_name,omitempty"` // Original image name from PDF/DOCX
}
ImageInfo contains metadata about an extracted image
type IndexOption ¶
type IndexOption func(*indexConfig)
IndexOption configures indexing behavior
func WithDeferEmbedding ¶ added in v0.0.6
func WithDeferEmbedding(defer_ bool) IndexOption
WithDeferEmbedding skips embedding generation during indexing. Use store.EmbedPendingDocuments() or store.EmbedDocuments() to generate embeddings later in a batch operation. This is recommended for bulk imports. See OPTIMISATIONS.md for recommended patterns.
func WithForceReindex ¶
func WithForceReindex(force bool) IndexOption
WithForceReindex forces re-indexing even if document exists
func WithIndexSource ¶ added in v0.0.13
func WithIndexSource(source string) IndexOption
WithIndexSource sets a custom source identifier for the document. This overrides the default format-based source ("pdf" or "docx"). Use this for logical categorization like "knowledgebase", "manual", etc.
func WithIndexTags ¶ added in v0.0.13
func WithIndexTags(tags map[string]string) IndexOption
WithIndexTags sets metadata tags for the document. Tags enable filtering in search queries via WithTags() search option.
func WithProgressCallback ¶
func WithProgressCallback(fn ProgressCallback) IndexOption
WithProgressCallback sets a callback for progress updates during indexing
func WithSourcePath ¶
func WithSourcePath(path string) IndexOption
WithSourcePath sets the original source path for metadata
type IndexProgress ¶
type IndexProgress struct {
DocumentID string `json:"document_id"`
DocumentName string `json:"document_name"`
Status string `json:"status"` // "parsing", "extracting", "indexing", "embedding", "complete", "error"
TotalPages int `json:"total_pages"`
ProcessedPages int `json:"processed_pages"`
TotalBlocks int `json:"total_blocks"`
ProcessedBlocks int `json:"processed_blocks"`
Error error `json:"error,omitempty"`
StartTime time.Time `json:"start_time"`
ElapsedTime time.Duration `json:"elapsed_time"`
}
IndexProgress reports progress during document indexing
type ObjectError ¶
type ObjectError struct {
ObjectNum int // Object number
GenNum int // Generation number
Message string // What went wrong
Err error // Underlying error
}
ObjectError indicates an error with a specific PDF object
func NewObjectError ¶
func NewObjectError(objNum, genNum int, message string, err error) *ObjectError
NewObjectError creates a new ObjectError
func (*ObjectError) Error ¶
func (e *ObjectError) Error() string
func (*ObjectError) Unwrap ¶
func (e *ObjectError) Unwrap() error
type Page ¶
type Page struct {
Number int `json:"number"`
Width float64 `json:"width"`
Height float64 `json:"height"`
Blocks []ContentBlock `json:"blocks"`
}
Page represents a single page with its content
type PageError ¶
type PageError struct {
PageNum int // 1-indexed page number
Message string // What went wrong
Err error // Underlying error
}
PageError indicates an error processing a specific page
func NewPageError ¶
NewPageError creates a new PageError
type ParseError ¶
type ParseError struct {
Op string // Operation that failed (e.g., "lexer.readToken")
Offset int64 // Byte offset in file where error occurred
Message string // Human-readable message
Err error // Underlying error
}
ParseError provides detailed information about PDF parsing failures
func NewParseError ¶
func NewParseError(op string, offset int64, message string, err error) *ParseError
NewParseError creates a new ParseError
func (*ParseError) Error ¶
func (e *ParseError) Error() string
func (*ParseError) Unwrap ¶
func (e *ParseError) Unwrap() error
type Posting ¶
type Posting struct {
DocumentID string `json:"doc_id"`
BlockID string `json:"block_id"`
Positions []int `json:"positions"` // Positions within the block text
TF float64 `json:"tf"` // Term frequency for this posting
}
Posting represents a term occurrence in the index
type ProgressCallback ¶
type ProgressCallback func(IndexProgress)
ProgressCallback is called during document indexing to report progress
type QueryType ¶
type QueryType string
QueryType represents the detected intent of a search query
const (
	// QueryTypeFactual for questions like "What is X?", "Who is Y?"
	QueryTypeFactual    QueryType = "factual"
	QueryTypeNavigation QueryType = "navigation"
	// QueryTypeSummary for "Summarize...", "Overview of..."
	QueryTypeSummary QueryType = "summary"
	// QueryTypeComparison for "Compare X and Y", "Difference between..."
	QueryTypeComparison QueryType = "comparison"
	// QueryTypeDefinition for "Define X", "What is the definition of..."
	QueryTypeDefinition QueryType = "definition"
	// QueryTypeList for "List all X", "Enumerate...", "What are all..."
	QueryTypeList QueryType = "list"
	// QueryTypeUnknown when intent cannot be determined
	QueryTypeUnknown QueryType = "unknown"
)
func DetectQueryType ¶
DetectQueryType analyzes a search query and returns its detected intent
type SearchDiagnostics ¶ added in v0.0.14
type SearchDiagnostics struct {
KeywordResults int `json:"keyword_results"` // Results from BM25 keyword search
VectorResults int `json:"vector_results"` // Results from vector/semantic search
KeywordTime time.Duration `json:"keyword_time"` // Time spent on keyword search
VectorTime time.Duration `json:"vector_time"` // Time spent on vector search
FusionTime time.Duration `json:"fusion_time"` // Time spent fusing results
FilteredByScore int `json:"filtered_by_score"` // Results filtered by MinScore
DiversifiedFrom int `json:"diversified_from"` // Results before diversification (0 if not applied)
}
SearchDiagnostics provides detailed information about how search was executed. Useful for debugging and optimizing search performance.
type SearchError ¶
type SearchError struct {
Query string // The search query
Message string // What went wrong
Err error // Underlying error
}
SearchError indicates a search operation failure
func NewSearchError ¶
func NewSearchError(query, message string, err error) *SearchError
NewSearchError creates a new SearchError
func (*SearchError) Error ¶
func (e *SearchError) Error() string
func (*SearchError) Unwrap ¶
func (e *SearchError) Unwrap() error
type SearchMode ¶
type SearchMode string
SearchMode defines the type of search
const (
	// SearchModeKeyword uses BM25 keyword search only
	SearchModeKeyword SearchMode = "keyword"
	// SearchModeSemantic uses vector similarity search only
	SearchModeSemantic SearchMode = "semantic"
	// SearchModeHybrid combines BM25 and vector search with RRF fusion
	SearchModeHybrid SearchMode = "hybrid"
)
func SuggestedSearchMode ¶
func SuggestedSearchMode(qt QueryType) SearchMode
SuggestedSearchMode returns the recommended search mode for a query type
type SearchOption ¶
type SearchOption func(*searchConfig)
SearchOption configures search behavior
func WithAgentOutput ¶
func WithAgentOutput(enabled bool) SearchOption
WithAgentOutput enables agent-friendly output format
func WithChunking ¶
func WithChunking(opts ChunkOptions) SearchOption
WithChunking configures result chunking for LLM context windows
func WithCitations ¶
func WithCitations(enabled bool) SearchOption
WithCitations adds citation references [1], [2], etc. to results
func WithContextWindow ¶
func WithContextWindow(blocks int) SearchOption
WithContextWindow sets the number of surrounding blocks to include
func WithDiagnostics ¶ added in v0.0.14
func WithDiagnostics(enabled bool) SearchOption
WithDiagnostics enables detailed search diagnostics in the results. Diagnostics include timing breakdowns, result counts per search type, and filtering stats. Useful for debugging and optimizing search performance.
func WithDiversify ¶ added in v0.0.14
func WithDiversify(maxPerDoc int) SearchOption
WithDiversify limits the number of results per document to improve variety. This is useful when multiple blocks from the same document match the query. Set maxPerDoc to 0 (default) for unlimited results per document.
func WithDocuments ¶
func WithDocuments(docIDs ...string) SearchOption
WithDocuments limits search to specific documents
func WithEfSearch ¶ added in v0.0.11
func WithEfSearch(ef int) SearchOption
WithEfSearch overrides the HNSW efSearch parameter for this query. Higher values improve recall at the cost of latency. If ef <= 0 (default), the store's configured EfSearch value is used. Recommended: 50 for speed, 100 for balanced, 200+ for high recall.
func WithEstimateTokens ¶
func WithEstimateTokens(enabled bool) SearchOption
WithEstimateTokens includes token count estimates in results
func WithHighlight ¶
func WithHighlight(pre, post string) SearchOption
WithHighlight sets the highlight markers for matched terms
func WithImages ¶
func WithImages(include bool) SearchOption
WithImages includes image blocks in search results
func WithKeywordWeight ¶
func WithKeywordWeight(weight float64) SearchOption
WithKeywordWeight sets the weight for keyword search in hybrid mode (0-1)
func WithMaxResults ¶
func WithMaxResults(n int) SearchOption
WithMaxResults sets the maximum number of results
func WithMetadata ¶ added in v0.0.12
func WithMetadata(include bool) SearchOption
WithMetadata includes document metadata (tags, source, external ID) in search results. This adds extra database queries per result, so it's disabled by default for performance.
func WithMinScore ¶
func WithMinScore(score float64) SearchOption
WithMinScore sets the minimum relevance score threshold
func WithPageRange ¶
func WithPageRange(start, end int) SearchOption
WithPageRange limits search to a specific page range
func WithSearchMode ¶
func WithSearchMode(mode SearchMode) SearchOption
WithSearchMode sets the search mode (keyword, semantic, or hybrid)
func WithSections ¶
func WithSections(sections ...string) SearchOption
WithSections limits search to specific sections
func WithSources ¶
func WithSources(sources ...string) SearchOption
WithSources filters search results by source or format (e.g., "pdf", "docx", "crm")
func WithTags ¶
func WithTags(tags map[string]string) SearchOption
WithTags filters search results by tags (AND logic - all must match)
func WithVectorWeight ¶
func WithVectorWeight(weight float64) SearchOption
WithVectorWeight sets the weight for vector search in hybrid mode (0-1)
type SearchResult ¶
type SearchResult struct {
DocumentID string `json:"document_id"`
DocumentName string `json:"document_name"`
BlockID string `json:"block_id"`
Content string `json:"content"` // Matched content
Snippet string `json:"snippet"` // Highlighted snippet
Score float64 `json:"score"` // Relevance score
Page int `json:"page"` // Page number
Section string `json:"section"` // Section name
Context []ContentBlock `json:"context"` // Surrounding blocks for RAG
Positions []int `json:"positions"` // Match positions in content
Images []string `json:"images,omitempty"` // Image paths in same section
Tags map[string]string `json:"tags,omitempty"` // Document tags
Source string `json:"source,omitempty"` // Source identifier (for CustomData)
ExternalID string `json:"external_id,omitempty"` // External system ID
}
SearchResult represents a single search hit
type SearchResults ¶
type SearchResults struct {
Query string `json:"query"`
TotalHits int `json:"total_hits"`
Results []SearchResult `json:"results"`
SearchTime time.Duration `json:"search_time"`
Diagnostics *SearchDiagnostics `json:"diagnostics,omitempty"`
}
SearchResults contains search results with metadata
type SemanticInfo ¶
type SemanticInfo struct {
IsHeading bool `json:"is_heading,omitempty"`
HeadingLevel int `json:"heading_level,omitempty"` // 1-6 like HTML
Section string `json:"section,omitempty"` // Parent section title
Keywords []string `json:"keywords,omitempty"` // Extracted keywords
Context string `json:"context,omitempty"` // Surrounding context summary
}
SemanticInfo contains AI-friendly metadata about content
type StorageError ¶
type StorageError struct {
Op string // Operation (e.g., "write", "read", "delete")
Path string // File or directory path
Message string // What went wrong
Err error // Underlying error
}
StorageError indicates a storage operation failure
func NewStorageError ¶
func NewStorageError(op, path, message string, err error) *StorageError
NewStorageError creates a new StorageError
func (*StorageError) Error ¶
func (e *StorageError) Error() string
func (*StorageError) Unwrap ¶
func (e *StorageError) Unwrap() error
type Store ¶
type Store struct {
// contains filtered or unexported fields
}
Store manages document storage, indexing, and search with a unified SQLite backend
func NewStore ¶
func NewStore(basePath string, opts ...StoreOption) (*Store, error)
NewStore creates a new document store at the specified path using SQLite
func (*Store) CancelBackground ¶ added in v0.0.9
func (s *Store) CancelBackground()
CancelBackground cancels background embedding if running. The operation will stop after the current document completes.
func (*Store) CheckDuplicate ¶
func (s *Store) CheckDuplicate(path string) (*DedupResult, error)
CheckDuplicate checks if a document at the given path is a duplicate of an existing document. It uses file checksum for comparison.
func (*Store) CheckDuplicateByContent ¶
func (s *Store) CheckDuplicateByContent(data []byte) (*DedupResult, error)
CheckDuplicateByContent checks if content already exists in the store. It computes a content hash from the provided data and checks for matches.
func (*Store) CheckHealth ¶ added in v0.0.8
func (s *Store) CheckHealth() (*StoreHealth, error)
CheckHealth performs a comprehensive consistency check on the store. It checks for HNSW-SQLite synchronization, incomplete embeddings, and other issues.
func (*Store) DatabaseInfo ¶ added in v0.0.5
func (s *Store) DatabaseInfo() (*DatabaseInfo, error)
DatabaseInfo returns information about the database schema and version
func (*Store) DeleteDocument ¶
DeleteDocument removes a document from the store
func (*Store) DetectQueryType ¶
DetectQueryType analyzes a query string and returns its detected intent type
func (*Store) EmbedDocuments ¶ added in v0.0.6
EmbedDocuments generates embeddings for specific documents by ID. This is useful for resumable batch processing. OPTIMIZED: Saves HNSW index only once at the end instead of after each document.
func (*Store) EmbedPendingDocuments ¶ added in v0.0.6
EmbedPendingDocuments generates embeddings for all documents that don't have them yet. This is the main method for deferred embedding patterns. OPTIMIZED: Saves HNSW index only once at the end instead of after each document.
func (*Store) EmbedPendingDocumentsAsync ¶ added in v0.0.9
EmbedPendingDocumentsAsync starts embedding in background. Returns immediately. Use GetBackgroundStatus() to check progress, IsBackgroundRunning() to check if still running, or WaitForBackground() to block.
func (*Store) FindByExternalID ¶ added in v0.0.11
FindByExternalID finds a document by source and external ID
func (*Store) GetBackgroundStatus ¶ added in v0.0.9
func (s *Store) GetBackgroundStatus() BackgroundEmbeddingStatus
GetBackgroundStatus returns the current status of background embedding. Safe to call even when no background operation is running.
func (*Store) GetContext ¶
func (s *Store) GetContext(docID, blockID string, windowSize int) (*ContextResult, error)
GetContext retrieves content blocks around a specific block
func (*Store) GetDocument ¶
GetDocument retrieves a document by ID
func (*Store) GetDocumentsWithIncompleteEmbeddings ¶ added in v0.0.8
func (s *Store) GetDocumentsWithIncompleteEmbeddings() ([]*DocumentInfo, error)
GetDocumentsWithIncompleteEmbeddings returns documents that have some but not all blocks embedded. This identifies documents whose embedding was interrupted mid-way and that need recovery.
func (*Store) GetDocumentsWithoutEmbeddings ¶ added in v0.0.6
func (s *Store) GetDocumentsWithoutEmbeddings() ([]*DocumentInfo, error)
GetDocumentsWithoutEmbeddings returns documents that have embeddable content but don't have any embeddings yet. Use this for resumable maintenance tasks.
func (*Store) GetEmbeddingStatus ¶
func (s *Store) GetEmbeddingStatus(docID string) (*EmbeddingStatus, error)
GetEmbeddingStatus returns the embedding status for a document
func (*Store) GetImagesByDocumentFiltered ¶
GetImagesByDocumentFiltered returns images for a document with optional section/page filters
func (*Store) GetLastImportTime ¶
GetLastImportTime returns the most recent import timestamp for a given source. Returns zero time if no imports are found for the source.
func (*Store) HasEmbeddings ¶
HasEmbeddings is a convenience method that returns true if a document has any embeddings
func (*Store) IndexCustomData ¶
func (s *Store) IndexCustomData(data *CustomData, opts ...IndexOption) (*Document, error)
IndexCustomData indexes custom structured data
func (*Store) IndexCustomDataBatch ¶ added in v0.0.6
func (s *Store) IndexCustomDataBatch(data []*CustomData, opts ...IndexOption) ([]*Document, error)
IndexCustomDataBatch indexes multiple custom data entries efficiently. This is optimized for bulk imports with deferred global stats updates.
func (*Store) IndexDocument ¶
func (s *Store) IndexDocument(path string, opts ...IndexOption) (*Document, error)
IndexDocument indexes a document from a file path
func (*Store) IndexDocumentWithProgress ¶
func (s *Store) IndexDocumentWithProgress(path string, callback ProgressCallback, opts ...IndexOption) (*Document, error)
IndexDocumentWithProgress indexes a document with progress callbacks
func (*Store) IndexReader ¶
IndexReader indexes a document from an io.Reader
func (*Store) IsBackgroundRunning ¶ added in v0.0.9
IsBackgroundRunning returns true if background embedding is in progress.
func (*Store) ListDocuments ¶
func (s *Store) ListDocuments() ([]*DocumentInfo, error)
ListDocuments returns all indexed documents
func (*Store) Repair ¶ added in v0.0.8
Repair fixes detected inconsistencies in the store. It rebuilds the HNSW index from SQLite and resumes incomplete embeddings. Returns nil if no repairs were needed or all repairs succeeded.
func (*Store) ResumeAllIncompleteEmbeddings ¶ added in v0.0.8
ResumeAllIncompleteEmbeddings resumes embedding for all documents with incomplete embeddings. This is useful for recovering from crashes or interruptions during batch embedding.
func (*Store) ResumeEmbedding ¶ added in v0.0.8
ResumeEmbedding continues embedding for a document that was partially embedded. Only embeds blocks that don't already have vectors, making it safe to call on documents that were interrupted during embedding.
func (*Store) Search ¶
func (s *Store) Search(query string, opts ...SearchOption) (*SearchResults, error)
Search performs a search across all documents
func (*Store) SearchForAgent ¶
func (s *Store) SearchForAgent(query string, opts ...SearchOption) (*AgentSearchResponse, error)
SearchForAgent performs a search optimized for AI agent consumption. Returns structured output with token estimates, citation references, and chunked results.
func (*Store) SearchInDocument ¶
func (s *Store) SearchInDocument(docID, query string, opts ...SearchOption) (*SearchResults, error)
SearchInDocument searches within a specific document
func (*Store) SetEmbeddingProvider ¶
SetEmbeddingProvider configures the embedding provider after store creation. It automatically detects and repairs inconsistencies between the HNSW index and SQLite vectors, rebuilding the index if necessary.
func (*Store) UpsertCustomData ¶
func (s *Store) UpsertCustomData(data *CustomData, opts ...IndexOption) (*Document, error)
UpsertCustomData updates an existing document or creates a new one based on source + external_id. If ExternalID is provided and a document with the same source + external_id exists, it will be updated. If ExternalID is empty or no matching document exists, a new document is created.
func (*Store) WaitForBackground ¶ added in v0.0.9
WaitForBackground blocks until background embedding completes. Returns the error from background embedding if any, or nil if successful. Returns nil immediately if no background operation is running.
type StoreHealth ¶ added in v0.0.8
type StoreHealth struct {
IsHealthy bool `json:"is_healthy"` // True if all checks pass
HNSWSize int `json:"hnsw_size"` // Number of vectors in HNSW index
SQLiteVectorCount int `json:"sqlite_vector_count"` // Number of vectors in SQLite
HNSWSynced bool `json:"hnsw_synced"` // True if HNSW matches SQLite
IncompleteEmbeddings []string `json:"incomplete_embeddings"` // Document IDs with partial embeddings
PendingEmbeddings []string `json:"pending_embeddings"` // Document IDs without any embeddings
DocumentCount int `json:"document_count"` // Total number of documents
BlockCount int `json:"block_count"` // Total number of content blocks
}
StoreHealth contains consistency check results for diagnosing store issues
type StoreOption ¶
type StoreOption func(*storeConfig)
StoreOption configures Store behavior
func WithCache ¶
func WithCache(enabled bool, size int) StoreOption
WithCache enables/disables object caching
func WithChecksum ¶
func WithChecksum(enabled bool) StoreOption
WithChecksum enables/disables document checksum computation
func WithDedupCheck ¶
func WithDedupCheck(enabled bool) StoreOption
WithDedupCheck enables duplicate detection before indexing
func WithHNSWConfig ¶ added in v0.0.6
func WithHNSWConfig(cfg HNSWConfig) StoreOption
WithHNSWConfig configures the HNSW vector index parameters. Use this to tune performance vs quality trade-offs for bulk imports. See OPTIMISATIONS.md for recommended settings by dataset size.
func WithImageExtraction ¶
func WithImageExtraction(enabled bool) StoreOption
WithImageExtraction enables/disables image extraction
func WithMaxConcurrency ¶
func WithMaxConcurrency(n int) StoreOption
WithMaxConcurrency sets the maximum concurrent operations
func WithNGrams ¶
func WithNGrams(enabled bool, size int) StoreOption
WithNGrams enables n-gram indexing for fuzzy search
func WithSemanticAnalysis ¶
func WithSemanticAnalysis(enabled bool) StoreOption
WithSemanticAnalysis enables/disables semantic analysis
func WithStemming ¶
func WithStemming(enabled bool) StoreOption
WithStemming enables/disables Porter stemming in search
func WithStopWords ¶
func WithStopWords(enabled bool) StoreOption
WithStopWords enables/disables stop word filtering
type StoreStats ¶
type StoreStats struct {
DocumentCount int `json:"document_count"`
TotalBlocks int `json:"total_blocks"`
TotalImages int `json:"total_images"`
IndexTerms int `json:"index_terms"`
VectorCount int `json:"vector_count"`
StorageBytes int64 `json:"storage_bytes"`
}
StoreStats contains statistics about the store
type StreamError ¶
type StreamError struct {
Filter string // Filter name (e.g., "FlateDecode")
Message string // What went wrong
Err error // Underlying error
}
StreamError indicates an error decoding a stream
func NewStreamError ¶
func NewStreamError(filter, message string, err error) *StreamError
NewStreamError creates a new StreamError
func (*StreamError) Error ¶
func (e *StreamError) Error() string
func (*StreamError) Unwrap ¶
func (e *StreamError) Unwrap() error
type TermEntry ¶
type TermEntry struct {
Term string `json:"term"`
DF int `json:"df"` // Document frequency
Postings []Posting `json:"postings"` // All occurrences
}
TermEntry contains all postings for a term
type TokenBudget ¶
TokenBudget helps track token usage across multiple operations
func NewTokenBudget ¶
func NewTokenBudget(maxTokens int) *TokenBudget
NewTokenBudget creates a new token budget tracker
func (*TokenBudget) Add ¶
func (b *TokenBudget) Add(tokens int) bool
Add adds tokens to the budget and returns true if within budget
func (*TokenBudget) AddText ¶
func (b *TokenBudget) AddText(text string) bool
AddText estimates and adds tokens for text and returns true if within budget
func (*TokenBudget) IsExhausted ¶
func (b *TokenBudget) IsExhausted() bool
IsExhausted returns true if budget is exhausted
func (*TokenBudget) Remaining ¶
func (b *TokenBudget) Remaining() int
Remaining returns remaining token budget
func (*TokenBudget) Usage ¶
func (b *TokenBudget) Usage() float64
Usage returns the current usage percentage (0-100)
Source Files
¶
Directories
¶
| Path | Synopsis |
|---|---|
| cmd | |
| test_pdf (command) | |
| internal | |
| nlp | Package nlp provides natural language processing utilities for text analysis. |
| search | Package search provides hybrid search functionality combining keyword and vector search. |