ragfs_core/traits.rs
1//! Core traits for RAGFS components.
2//!
3//! This module defines the trait interfaces that all RAGFS components implement:
4//!
5//! - [`ContentExtractor`]: Extract content from files
6//! - [`Chunker`]: Split content into chunks
7//! - [`Embedder`]: Generate vector embeddings
8//! - [`VectorStore`]: Store and search vectors
9//! - [`Indexer`]: Coordinate the indexing pipeline
10//!
11//! These traits enable a pluggable architecture where different implementations
12//! can be swapped without changing the rest of the system.
13
14use async_trait::async_trait;
15use std::path::Path;
16
17use crate::error::{ChunkError, EmbedError, ExtractError, StoreError};
18use crate::types::{
19 Chunk, ChunkConfig, ChunkOutput, ContentType, EmbeddingConfig, EmbeddingOutput,
20 ExtractedContent, FileRecord, IndexStats, Modality, SearchQuery, SearchResult, StoreStats,
21};
22
23// ============================================================================
24// Content Extraction
25// ============================================================================
26
27/// Trait for extracting content from files.
28#[async_trait]
29pub trait ContentExtractor: Send + Sync {
30 /// Returns the MIME types this extractor can handle.
31 fn supported_types(&self) -> &[&str];
32
33 /// Check if this extractor can handle the given file.
34 fn can_extract(&self, path: &Path, mime_type: &str) -> bool {
35 self.supported_types().contains(&mime_type) || self.can_extract_by_extension(path)
36 }
37
38 /// Check if extractor can handle based on file extension.
39 fn can_extract_by_extension(&self, _path: &Path) -> bool {
40 false
41 }
42
43 /// Extract content from a file.
44 async fn extract(&self, path: &Path) -> Result<ExtractedContent, ExtractError>;
45
46 /// Extract content from bytes (for embedded content).
47 async fn extract_bytes(
48 &self,
49 _data: &[u8],
50 _mime_type: &str,
51 ) -> Result<ExtractedContent, ExtractError> {
52 Err(ExtractError::UnsupportedType(
53 "byte extraction not supported".to_string(),
54 ))
55 }
56}
57
58// ============================================================================
59// Chunking
60// ============================================================================
61
62/// Trait for splitting content into chunks.
63#[async_trait]
64pub trait Chunker: Send + Sync {
65 /// Name of this chunking strategy.
66 fn name(&self) -> &str;
67
68 /// Content types this chunker is designed for.
69 fn content_types(&self) -> &[&str];
70
71 /// Check if this chunker can handle the given content type.
72 fn can_chunk(&self, content_type: &ContentType) -> bool;
73
74 /// Chunk the extracted content.
75 async fn chunk(
76 &self,
77 content: &ExtractedContent,
78 config: &ChunkConfig,
79 ) -> Result<Vec<ChunkOutput>, ChunkError>;
80}
81
82// ============================================================================
83// Embedding
84// ============================================================================
85
86/// Trait for generating embeddings.
87#[async_trait]
88pub trait Embedder: Send + Sync {
89 /// Model name/identifier.
90 fn model_name(&self) -> &str;
91
92 /// Embedding dimension.
93 fn dimension(&self) -> usize;
94
95 /// Maximum tokens per input.
96 fn max_tokens(&self) -> usize;
97
98 /// Supported modalities.
99 fn modalities(&self) -> &[Modality];
100
101 /// Embed text content.
102 async fn embed_text(
103 &self,
104 texts: &[&str],
105 config: &EmbeddingConfig,
106 ) -> Result<Vec<EmbeddingOutput>, EmbedError>;
107
108 /// Embed image content.
109 async fn embed_image(
110 &self,
111 _image_data: &[u8],
112 _config: &EmbeddingConfig,
113 ) -> Result<EmbeddingOutput, EmbedError> {
114 Err(EmbedError::ModalityNotSupported(Modality::Image))
115 }
116
117 /// Embed a query (may use different instruction).
118 async fn embed_query(
119 &self,
120 query: &str,
121 config: &EmbeddingConfig,
122 ) -> Result<EmbeddingOutput, EmbedError> {
123 let results = self.embed_text(&[query], config).await?;
124 results
125 .into_iter()
126 .next()
127 .ok_or_else(|| EmbedError::Inference("empty embedding result".to_string()))
128 }
129}
130
131// ============================================================================
132// Vector Storage
133// ============================================================================
134
135/// Trait for vector storage and search.
136#[async_trait]
137pub trait VectorStore: Send + Sync {
138 /// Initialize the store.
139 async fn init(&self) -> Result<(), StoreError>;
140
141 /// Insert or update chunks.
142 async fn upsert_chunks(&self, chunks: &[Chunk]) -> Result<(), StoreError>;
143
144 /// Search for similar chunks.
145 async fn search(&self, query: SearchQuery) -> Result<Vec<SearchResult>, StoreError>;
146
147 /// Hybrid search (vector + full-text).
148 async fn hybrid_search(&self, query: SearchQuery) -> Result<Vec<SearchResult>, StoreError>;
149
150 /// Delete all chunks for a file.
151 async fn delete_by_file_path(&self, path: &Path) -> Result<u64, StoreError>;
152
153 /// Update file path for all chunks (for renames).
154 async fn update_file_path(&self, from: &Path, to: &Path) -> Result<u64, StoreError>;
155
156 /// Get all chunks for a file.
157 async fn get_chunks_for_file(&self, path: &Path) -> Result<Vec<Chunk>, StoreError>;
158
159 /// Get file record.
160 async fn get_file(&self, path: &Path) -> Result<Option<FileRecord>, StoreError>;
161
162 /// Upsert file record.
163 async fn upsert_file(&self, record: &FileRecord) -> Result<(), StoreError>;
164
165 /// Get store statistics.
166 async fn stats(&self) -> Result<StoreStats, StoreError>;
167
168 /// Get all chunks in the store.
169 ///
170 /// This is useful for operations that need to iterate over all indexed content,
171 /// such as duplicate detection or bulk analysis.
172 async fn get_all_chunks(&self) -> Result<Vec<Chunk>, StoreError>;
173
174 /// Get all file records in the store.
175 ///
176 /// This is useful for operations that need to iterate over all indexed files,
177 /// such as organization planning or cleanup analysis.
178 async fn get_all_files(&self) -> Result<Vec<FileRecord>, StoreError>;
179}
180
181// ============================================================================
182// Indexer
183// ============================================================================
184
185/// Trait for file indexing coordination.
186#[async_trait]
187pub trait Indexer: Send + Sync {
188 /// Start watching a directory for changes.
189 async fn watch(&self, path: &Path) -> Result<(), crate::Error>;
190
191 /// Stop watching.
192 async fn stop(&self) -> Result<(), crate::Error>;
193
194 /// Manually trigger indexing of a path.
195 async fn index(&self, path: &Path, force: bool) -> Result<(), crate::Error>;
196
197 /// Get current index statistics.
198 async fn stats(&self) -> Result<IndexStats, crate::Error>;
199
200 /// Check if a file needs re-indexing.
201 async fn needs_reindex(&self, path: &Path) -> Result<bool, crate::Error>;
202}