ragfs_chunker/
registry.rs

1//! Chunker registry for managing chunking strategies.
2
3use ragfs_core::{ChunkConfig, ChunkError, ChunkOutput, Chunker, ContentType, ExtractedContent};
4use std::collections::HashMap;
5use std::sync::Arc;
6
7/// Registry of chunking strategies.
8pub struct ChunkerRegistry {
9    /// Named chunkers
10    chunkers: HashMap<String, Arc<dyn Chunker>>,
11    /// Content type to chunker name mapping
12    type_mapping: HashMap<String, String>,
13    /// Default chunker name
14    default_chunker: Option<String>,
15}
16
17impl ChunkerRegistry {
18    /// Create a new empty registry.
19    #[must_use]
20    pub fn new() -> Self {
21        Self {
22            chunkers: HashMap::new(),
23            type_mapping: HashMap::new(),
24            default_chunker: None,
25        }
26    }
27
28    /// Register a chunker.
29    pub fn register<C: Chunker + 'static>(&mut self, name: &str, chunker: C) {
30        let chunker = Arc::new(chunker);
31        for content_type in chunker.content_types() {
32            self.type_mapping
33                .insert((*content_type).to_string(), name.to_string());
34        }
35        self.chunkers.insert(name.to_string(), chunker);
36    }
37
38    /// Set the default chunker.
39    pub fn set_default(&mut self, name: &str) {
40        self.default_chunker = Some(name.to_string());
41    }
42
43    /// Get a chunker for a content type.
44    #[must_use]
45    pub fn get_for_content_type(&self, content_type: &ContentType) -> Option<Arc<dyn Chunker>> {
46        // Try to find specific chunker
47        for chunker in self.chunkers.values() {
48            if chunker.can_chunk(content_type) {
49                return Some(chunker.clone());
50            }
51        }
52
53        // Fall back to default
54        self.default_chunker
55            .as_ref()
56            .and_then(|name| self.chunkers.get(name))
57            .cloned()
58    }
59
60    /// Chunk content using appropriate strategy.
61    pub async fn chunk(
62        &self,
63        content: &ExtractedContent,
64        content_type: &ContentType,
65        config: &ChunkConfig,
66    ) -> Result<Vec<ChunkOutput>, ChunkError> {
67        let chunker = self
68            .get_for_content_type(content_type)
69            .ok_or_else(|| ChunkError::Failed("no suitable chunker found".to_string()))?;
70
71        chunker.chunk(content, config).await
72    }
73}
74
75impl Default for ChunkerRegistry {
76    fn default() -> Self {
77        Self::new()
78    }
79}
80
#[cfg(test)]
mod tests {
    use super::*;
    use crate::FixedSizeChunker;
    use ragfs_core::ContentMetadataInfo;

    /// Build extracted content that carries only `text`.
    fn create_test_content(text: &str) -> ExtractedContent {
        ExtractedContent {
            text: text.to_owned(),
            elements: Vec::new(),
            images: Vec::new(),
            metadata: ContentMetadataInfo::default(),
        }
    }

    /// Build a registry holding a single `FixedSizeChunker` named "fixed".
    fn registry_with_fixed() -> ChunkerRegistry {
        let mut registry = ChunkerRegistry::new();
        registry.register("fixed", FixedSizeChunker::new());
        registry
    }

    #[test]
    fn test_new_registry_is_empty() {
        let fresh = ChunkerRegistry::new();
        assert_eq!(fresh.chunkers.len(), 0);
        assert_eq!(fresh.type_mapping.len(), 0);
        assert_eq!(fresh.default_chunker, None);
    }

    #[test]
    fn test_default_implementation() {
        let fresh = ChunkerRegistry::default();
        assert_eq!(fresh.chunkers.len(), 0);
        assert_eq!(fresh.default_chunker, None);
    }

    #[test]
    fn test_register_chunker() {
        let registry = registry_with_fixed();

        assert!(registry.chunkers.get("fixed").is_some());
        // FixedSizeChunker registers text, code, markdown
        assert!(registry.type_mapping.get("text").is_some());
    }

    #[test]
    fn test_set_default_chunker() {
        let mut registry = registry_with_fixed();
        registry.set_default("fixed");

        assert_eq!(registry.default_chunker.as_deref(), Some("fixed"));
    }

    #[test]
    fn test_get_for_content_type_text() {
        let registry = registry_with_fixed();

        assert!(registry.get_for_content_type(&ContentType::Text).is_some());
    }

    #[test]
    fn test_get_for_content_type_markdown() {
        let registry = registry_with_fixed();

        assert!(registry
            .get_for_content_type(&ContentType::Markdown)
            .is_some());
    }

    #[test]
    fn test_get_for_content_type_code() {
        let registry = registry_with_fixed();
        let rust_code = ContentType::Code {
            language: "rust".to_string(),
            symbol: None,
        };

        assert!(registry.get_for_content_type(&rust_code).is_some());
    }

    #[test]
    fn test_get_for_content_type_falls_back_to_default() {
        let mut registry = registry_with_fixed();
        registry.set_default("fixed");

        // FixedSizeChunker can handle any type, but this tests the fallback logic
        assert!(registry.get_for_content_type(&ContentType::Text).is_some());
    }

    #[test]
    fn test_get_for_content_type_none_when_no_match() {
        // Empty registry with no default
        let registry = ChunkerRegistry::new();

        assert!(registry.get_for_content_type(&ContentType::Text).is_none());
    }

    #[tokio::test]
    async fn test_chunk_success() {
        let registry = registry_with_fixed();
        let content = create_test_content("Hello, world!");
        let config = ChunkConfig::default();

        let chunks = registry
            .chunk(&content, &ContentType::Text, &config)
            .await
            .expect("chunking short text should succeed");

        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].content, "Hello, world!");
    }

    #[tokio::test]
    async fn test_chunk_long_text() {
        let registry = registry_with_fixed();
        let content = create_test_content(&"A".repeat(3000));
        let config = ChunkConfig {
            target_size: 256,
            max_size: 512,
            overlap: 32,
            ..Default::default()
        };

        let chunks = registry
            .chunk(&content, &ContentType::Text, &config)
            .await
            .expect("chunking long text should succeed");

        assert!(chunks.len() > 1, "Long text should produce multiple chunks");
    }

    #[tokio::test]
    async fn test_chunk_empty_text() {
        let registry = registry_with_fixed();
        let content = create_test_content("");
        let config = ChunkConfig::default();

        let chunks = registry
            .chunk(&content, &ContentType::Text, &config)
            .await
            .expect("chunking empty text should succeed");

        assert!(chunks.is_empty());
    }

    #[tokio::test]
    async fn test_chunk_fails_without_chunker() {
        let registry = ChunkerRegistry::new();
        let content = create_test_content("Hello");
        let config = ChunkConfig::default();

        let error = registry
            .chunk(&content, &ContentType::Text, &config)
            .await
            .expect_err("an empty registry must not produce chunks");

        match error {
            ChunkError::Failed(msg) => assert!(msg.contains("no suitable chunker")),
            _ => panic!("Expected ChunkError::Failed"),
        }
    }

    #[test]
    fn test_multiple_chunkers() {
        let mut registry = ChunkerRegistry::new();
        for name in ["fixed1", "fixed2"] {
            registry.register(name, FixedSizeChunker::new());
        }

        assert_eq!(registry.chunkers.len(), 2);
    }

    #[test]
    fn test_chunker_overrides_type_mapping() {
        let mut registry = ChunkerRegistry::new();
        registry.register("first", FixedSizeChunker::new());
        registry.register("second", FixedSizeChunker::new());

        // Second registration should override the type mapping
        let owner = registry.type_mapping.get("text").map(String::as_str);
        assert_eq!(owner, Some("second"));
    }
}