ragfs_chunker/
registry.rs1use ragfs_core::{ChunkConfig, ChunkError, ChunkOutput, Chunker, ContentType, ExtractedContent};
4use std::collections::HashMap;
5use std::sync::Arc;
6
7pub struct ChunkerRegistry {
9 chunkers: HashMap<String, Arc<dyn Chunker>>,
11 type_mapping: HashMap<String, String>,
13 default_chunker: Option<String>,
15}
16
17impl ChunkerRegistry {
18 #[must_use]
20 pub fn new() -> Self {
21 Self {
22 chunkers: HashMap::new(),
23 type_mapping: HashMap::new(),
24 default_chunker: None,
25 }
26 }
27
28 pub fn register<C: Chunker + 'static>(&mut self, name: &str, chunker: C) {
30 let chunker = Arc::new(chunker);
31 for content_type in chunker.content_types() {
32 self.type_mapping
33 .insert((*content_type).to_string(), name.to_string());
34 }
35 self.chunkers.insert(name.to_string(), chunker);
36 }
37
38 pub fn set_default(&mut self, name: &str) {
40 self.default_chunker = Some(name.to_string());
41 }
42
43 #[must_use]
45 pub fn get_for_content_type(&self, content_type: &ContentType) -> Option<Arc<dyn Chunker>> {
46 for chunker in self.chunkers.values() {
48 if chunker.can_chunk(content_type) {
49 return Some(chunker.clone());
50 }
51 }
52
53 self.default_chunker
55 .as_ref()
56 .and_then(|name| self.chunkers.get(name))
57 .cloned()
58 }
59
60 pub async fn chunk(
62 &self,
63 content: &ExtractedContent,
64 content_type: &ContentType,
65 config: &ChunkConfig,
66 ) -> Result<Vec<ChunkOutput>, ChunkError> {
67 let chunker = self
68 .get_for_content_type(content_type)
69 .ok_or_else(|| ChunkError::Failed("no suitable chunker found".to_string()))?;
70
71 chunker.chunk(content, config).await
72 }
73}
74
75impl Default for ChunkerRegistry {
76 fn default() -> Self {
77 Self::new()
78 }
79}
80
#[cfg(test)]
mod tests {
    use super::*;
    use crate::FixedSizeChunker;
    use ragfs_core::ContentMetadataInfo;

    /// Wraps `text` in an `ExtractedContent` with no elements, no images and
    /// default metadata.
    fn create_test_content(text: &str) -> ExtractedContent {
        ExtractedContent {
            text: String::from(text),
            elements: Vec::new(),
            images: Vec::new(),
            metadata: ContentMetadataInfo::default(),
        }
    }

    /// Builds a registry holding one fresh `FixedSizeChunker` per name, in order.
    fn registry_with_fixed(names: &[&str]) -> ChunkerRegistry {
        let mut registry = ChunkerRegistry::new();
        for name in names {
            registry.register(name, FixedSizeChunker::new());
        }
        registry
    }

    #[test]
    fn test_new_registry_is_empty() {
        let registry = ChunkerRegistry::new();
        assert_eq!(registry.chunkers.len(), 0);
        assert_eq!(registry.type_mapping.len(), 0);
        assert!(registry.default_chunker.is_none());
    }

    #[test]
    fn test_default_implementation() {
        let registry = ChunkerRegistry::default();
        assert_eq!(registry.chunkers.len(), 0);
        assert!(registry.default_chunker.is_none());
    }

    #[test]
    fn test_register_chunker() {
        let registry = registry_with_fixed(&["fixed"]);

        assert!(registry.chunkers.contains_key("fixed"));
        assert!(registry.type_mapping.contains_key("text"));
    }

    #[test]
    fn test_set_default_chunker() {
        let mut registry = registry_with_fixed(&["fixed"]);
        registry.set_default("fixed");

        assert_eq!(registry.default_chunker.as_deref(), Some("fixed"));
    }

    #[test]
    fn test_get_for_content_type_text() {
        let registry = registry_with_fixed(&["fixed"]);

        assert!(registry.get_for_content_type(&ContentType::Text).is_some());
    }

    #[test]
    fn test_get_for_content_type_markdown() {
        let registry = registry_with_fixed(&["fixed"]);

        assert!(registry
            .get_for_content_type(&ContentType::Markdown)
            .is_some());
    }

    #[test]
    fn test_get_for_content_type_code() {
        let registry = registry_with_fixed(&["fixed"]);
        let code = ContentType::Code {
            language: "rust".to_string(),
            symbol: None,
        };

        assert!(registry.get_for_content_type(&code).is_some());
    }

    // NOTE(review): the text test above already finds "fixed" for
    // `ContentType::Text` with no default set, so this lookup probably
    // succeeds through the direct match rather than the default fallback.
    // Confirm, and exercise the fallback with a content type the chunker
    // rejects if one exists.
    #[test]
    fn test_get_for_content_type_falls_back_to_default() {
        let mut registry = registry_with_fixed(&["fixed"]);
        registry.set_default("fixed");

        assert!(registry.get_for_content_type(&ContentType::Text).is_some());
    }

    #[test]
    fn test_get_for_content_type_none_when_no_match() {
        let registry = ChunkerRegistry::new();

        assert!(registry.get_for_content_type(&ContentType::Text).is_none());
    }

    #[tokio::test]
    async fn test_chunk_success() {
        let registry = registry_with_fixed(&["fixed"]);
        let content = create_test_content("Hello, world!");
        let config = ChunkConfig::default();

        let result = registry.chunk(&content, &ContentType::Text, &config).await;
        assert!(result.is_ok());

        let chunks = result.unwrap();
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].content, "Hello, world!");
    }

    #[tokio::test]
    async fn test_chunk_long_text() {
        let registry = registry_with_fixed(&["fixed"]);
        let content = create_test_content(&"A".repeat(3000));
        let config = ChunkConfig {
            target_size: 256,
            max_size: 512,
            overlap: 32,
            ..Default::default()
        };

        let result = registry.chunk(&content, &ContentType::Text, &config).await;
        assert!(result.is_ok());

        let chunks = result.unwrap();
        assert!(chunks.len() > 1, "Long text should produce multiple chunks");
    }

    #[tokio::test]
    async fn test_chunk_empty_text() {
        let registry = registry_with_fixed(&["fixed"]);
        let content = create_test_content("");
        let config = ChunkConfig::default();

        let result = registry.chunk(&content, &ContentType::Text, &config).await;
        assert!(result.is_ok());
        assert!(result.unwrap().is_empty());
    }

    #[tokio::test]
    async fn test_chunk_fails_without_chunker() {
        let registry = ChunkerRegistry::new();
        let content = create_test_content("Hello");
        let config = ChunkConfig::default();

        let result = registry.chunk(&content, &ContentType::Text, &config).await;
        assert!(result.is_err());

        if let ChunkError::Failed(msg) = result.unwrap_err() {
            assert!(msg.contains("no suitable chunker"));
        } else {
            panic!("Expected ChunkError::Failed");
        }
    }

    #[test]
    fn test_multiple_chunkers() {
        let registry = registry_with_fixed(&["fixed1", "fixed2"]);

        assert_eq!(registry.chunkers.len(), 2);
    }

    #[test]
    fn test_chunker_overrides_type_mapping() {
        // Both chunkers report the same content types, so the later
        // registration must own the "text" mapping.
        let registry = registry_with_fixed(&["first", "second"]);

        assert_eq!(
            registry.type_mapping.get("text").map(String::as_str),
            Some("second")
        );
    }
}