ragfs_core/
traits.rs

1//! Core traits for RAGFS components.
2//!
3//! This module defines the trait interfaces that all RAGFS components implement:
4//!
5//! - [`ContentExtractor`]: Extract content from files
6//! - [`Chunker`]: Split content into chunks
7//! - [`Embedder`]: Generate vector embeddings
8//! - [`VectorStore`]: Store and search vectors
9//! - [`Indexer`]: Coordinate the indexing pipeline
10//!
11//! These traits enable a pluggable architecture where different implementations
12//! can be swapped without changing the rest of the system.
13
14use async_trait::async_trait;
15use std::path::Path;
16
17use crate::error::{ChunkError, EmbedError, ExtractError, StoreError};
18use crate::types::{
19    Chunk, ChunkConfig, ChunkOutput, ContentType, EmbeddingConfig, EmbeddingOutput,
20    ExtractedContent, FileRecord, IndexStats, Modality, SearchQuery, SearchResult, StoreStats,
21};
22
23// ============================================================================
24// Content Extraction
25// ============================================================================
26
27/// Trait for extracting content from files.
28#[async_trait]
29pub trait ContentExtractor: Send + Sync {
30    /// Returns the MIME types this extractor can handle.
31    fn supported_types(&self) -> &[&str];
32
33    /// Check if this extractor can handle the given file.
34    fn can_extract(&self, path: &Path, mime_type: &str) -> bool {
35        self.supported_types().contains(&mime_type) || self.can_extract_by_extension(path)
36    }
37
38    /// Check if extractor can handle based on file extension.
39    fn can_extract_by_extension(&self, _path: &Path) -> bool {
40        false
41    }
42
43    /// Extract content from a file.
44    async fn extract(&self, path: &Path) -> Result<ExtractedContent, ExtractError>;
45
46    /// Extract content from bytes (for embedded content).
47    async fn extract_bytes(
48        &self,
49        _data: &[u8],
50        _mime_type: &str,
51    ) -> Result<ExtractedContent, ExtractError> {
52        Err(ExtractError::UnsupportedType(
53            "byte extraction not supported".to_string(),
54        ))
55    }
56}
57
58// ============================================================================
59// Chunking
60// ============================================================================
61
62/// Trait for splitting content into chunks.
63#[async_trait]
64pub trait Chunker: Send + Sync {
65    /// Name of this chunking strategy.
66    fn name(&self) -> &str;
67
68    /// Content types this chunker is designed for.
69    fn content_types(&self) -> &[&str];
70
71    /// Check if this chunker can handle the given content type.
72    fn can_chunk(&self, content_type: &ContentType) -> bool;
73
74    /// Chunk the extracted content.
75    async fn chunk(
76        &self,
77        content: &ExtractedContent,
78        config: &ChunkConfig,
79    ) -> Result<Vec<ChunkOutput>, ChunkError>;
80}
81
82// ============================================================================
83// Embedding
84// ============================================================================
85
86/// Trait for generating embeddings.
87#[async_trait]
88pub trait Embedder: Send + Sync {
89    /// Model name/identifier.
90    fn model_name(&self) -> &str;
91
92    /// Embedding dimension.
93    fn dimension(&self) -> usize;
94
95    /// Maximum tokens per input.
96    fn max_tokens(&self) -> usize;
97
98    /// Supported modalities.
99    fn modalities(&self) -> &[Modality];
100
101    /// Embed text content.
102    async fn embed_text(
103        &self,
104        texts: &[&str],
105        config: &EmbeddingConfig,
106    ) -> Result<Vec<EmbeddingOutput>, EmbedError>;
107
108    /// Embed image content.
109    async fn embed_image(
110        &self,
111        _image_data: &[u8],
112        _config: &EmbeddingConfig,
113    ) -> Result<EmbeddingOutput, EmbedError> {
114        Err(EmbedError::ModalityNotSupported(Modality::Image))
115    }
116
117    /// Embed a query (may use different instruction).
118    async fn embed_query(
119        &self,
120        query: &str,
121        config: &EmbeddingConfig,
122    ) -> Result<EmbeddingOutput, EmbedError> {
123        let results = self.embed_text(&[query], config).await?;
124        results
125            .into_iter()
126            .next()
127            .ok_or_else(|| EmbedError::Inference("empty embedding result".to_string()))
128    }
129}
130
131// ============================================================================
132// Vector Storage
133// ============================================================================
134
135/// Trait for vector storage and search.
136#[async_trait]
137pub trait VectorStore: Send + Sync {
138    /// Initialize the store.
139    async fn init(&self) -> Result<(), StoreError>;
140
141    /// Insert or update chunks.
142    async fn upsert_chunks(&self, chunks: &[Chunk]) -> Result<(), StoreError>;
143
144    /// Search for similar chunks.
145    async fn search(&self, query: SearchQuery) -> Result<Vec<SearchResult>, StoreError>;
146
147    /// Hybrid search (vector + full-text).
148    async fn hybrid_search(&self, query: SearchQuery) -> Result<Vec<SearchResult>, StoreError>;
149
150    /// Delete all chunks for a file.
151    async fn delete_by_file_path(&self, path: &Path) -> Result<u64, StoreError>;
152
153    /// Update file path for all chunks (for renames).
154    async fn update_file_path(&self, from: &Path, to: &Path) -> Result<u64, StoreError>;
155
156    /// Get all chunks for a file.
157    async fn get_chunks_for_file(&self, path: &Path) -> Result<Vec<Chunk>, StoreError>;
158
159    /// Get file record.
160    async fn get_file(&self, path: &Path) -> Result<Option<FileRecord>, StoreError>;
161
162    /// Upsert file record.
163    async fn upsert_file(&self, record: &FileRecord) -> Result<(), StoreError>;
164
165    /// Get store statistics.
166    async fn stats(&self) -> Result<StoreStats, StoreError>;
167
168    /// Get all chunks in the store.
169    ///
170    /// This is useful for operations that need to iterate over all indexed content,
171    /// such as duplicate detection or bulk analysis.
172    async fn get_all_chunks(&self) -> Result<Vec<Chunk>, StoreError>;
173
174    /// Get all file records in the store.
175    ///
176    /// This is useful for operations that need to iterate over all indexed files,
177    /// such as organization planning or cleanup analysis.
178    async fn get_all_files(&self) -> Result<Vec<FileRecord>, StoreError>;
179}
180
181// ============================================================================
182// Indexer
183// ============================================================================
184
185/// Trait for file indexing coordination.
186#[async_trait]
187pub trait Indexer: Send + Sync {
188    /// Start watching a directory for changes.
189    async fn watch(&self, path: &Path) -> Result<(), crate::Error>;
190
191    /// Stop watching.
192    async fn stop(&self) -> Result<(), crate::Error>;
193
194    /// Manually trigger indexing of a path.
195    async fn index(&self, path: &Path, force: bool) -> Result<(), crate::Error>;
196
197    /// Get current index statistics.
198    async fn stats(&self) -> Result<IndexStats, crate::Error>;
199
200    /// Check if a file needs re-indexing.
201    async fn needs_reindex(&self, path: &Path) -> Result<bool, crate::Error>;
202}