1use async_trait::async_trait;
4use ragfs_core::{
5 ChunkConfig, ChunkError, ChunkOutput, ChunkOutputMetadata, Chunker, ContentType,
6 ExtractedContent,
7};
8
/// Content-agnostic chunker that splits text into fixed-size character
/// windows (sized from `ChunkConfig` token counts via a chars-per-token
/// heuristic), with configurable overlap and boundaries snapped to natural
/// break points (paragraphs, lines, sentence ends). Stateless unit struct.
pub struct FixedSizeChunker;
11
12impl FixedSizeChunker {
13 #[must_use]
15 pub fn new() -> Self {
16 Self
17 }
18}
19
20impl Default for FixedSizeChunker {
21 fn default() -> Self {
22 Self::new()
23 }
24}
25
26#[async_trait]
27impl Chunker for FixedSizeChunker {
28 fn name(&self) -> &'static str {
29 "fixed_size"
30 }
31
32 fn content_types(&self) -> &[&str] {
33 &["text", "code", "markdown"]
34 }
35
36 fn can_chunk(&self, _content_type: &ContentType) -> bool {
37 true
39 }
40
41 async fn chunk(
42 &self,
43 content: &ExtractedContent,
44 config: &ChunkConfig,
45 ) -> Result<Vec<ChunkOutput>, ChunkError> {
46 let text = &content.text;
47 if text.is_empty() {
48 return Ok(vec![]);
49 }
50
51 let mut chunks = Vec::new();
52 let chars: Vec<char> = text.chars().collect();
53 let total_chars = chars.len();
54
55 let chars_per_token = 4;
57 let target_chars = config.target_size * chars_per_token;
58 let overlap_chars = config.overlap * chars_per_token;
59 let step = target_chars.saturating_sub(overlap_chars).max(1);
60
61 let mut start = 0;
62 while start < total_chars {
63 let end = (start + target_chars).min(total_chars);
64
65 let actual_end = find_break_point(&chars, start, end, total_chars);
67
68 let chunk_text: String = chars[start..actual_end].iter().collect();
69 let byte_start = text.char_indices().nth(start).map_or(0, |(i, _)| i) as u64;
70 let byte_end = text
71 .char_indices()
72 .nth(actual_end)
73 .map_or(text.len(), |(i, _)| i) as u64;
74
75 let line_start = text[..byte_start as usize].matches('\n').count() as u32;
77 let line_end = line_start + chunk_text.matches('\n').count() as u32;
78
79 chunks.push(ChunkOutput {
80 content: chunk_text,
81 byte_range: byte_start..byte_end,
82 line_range: Some(line_start..line_end),
83 parent_index: None,
84 depth: 0,
85 metadata: ChunkOutputMetadata {
86 language: content.metadata.language.clone(),
87 ..Default::default()
88 },
89 });
90
91 start += step;
92 if actual_end >= total_chars {
93 break;
94 }
95 }
96
97 Ok(chunks)
98 }
99}
100
/// Picks a chunk boundary near `target_end`, preferring natural breaks.
///
/// Scans a window around `target_end` (up to 20% of the chunk span
/// backward, 10% forward) for, in priority order: a paragraph break
/// (`\n\n`), a single newline, or sentence-ending punctuation followed by
/// whitespace. Falls back to `target_end` itself, and clamps to `total`
/// when the target already reaches the end of the text.
fn find_break_point(chars: &[char], start: usize, target_end: usize, total: usize) -> usize {
    if target_end >= total {
        return total;
    }

    let span = target_end - start;
    let window = target_end.saturating_sub(span / 5)..(target_end + span / 10).min(total);

    // Paragraph break: cut just after the blank line.
    let paragraph = window
        .clone()
        .rev()
        .find(|&i| i + 1 < total && chars[i] == '\n' && chars[i + 1] == '\n');
    if let Some(i) = paragraph {
        return i + 2;
    }

    // Single line break: cut just after the newline.
    if let Some(i) = window.clone().rev().find(|&i| chars[i] == '\n') {
        return i + 1;
    }

    // Sentence end: cut just after the punctuation, before the whitespace.
    let sentence = window
        .rev()
        .find(|&i| matches!(chars[i], '.' | '!' | '?') && i + 1 < total && chars[i + 1].is_whitespace());
    if let Some(i) = sentence {
        return i + 1;
    }

    target_end
}
138
#[cfg(test)]
mod tests {
    use super::*;
    use ragfs_core::ContentMetadataInfo;

    /// Wraps `text` in an `ExtractedContent` with empty elements/metadata.
    fn create_test_content(text: &str) -> ExtractedContent {
        ExtractedContent {
            text: text.to_string(),
            elements: vec![],
            images: vec![],
            metadata: ContentMetadataInfo::default(),
        }
    }

    #[tokio::test]
    async fn test_chunk_empty_text() {
        let chunker = FixedSizeChunker::new();
        let content = create_test_content("");
        let config = ChunkConfig::default();

        let chunks = chunker.chunk(&content, &config).await.unwrap();

        assert!(chunks.is_empty());
    }

    #[tokio::test]
    async fn test_chunk_short_text() {
        let chunker = FixedSizeChunker::new();
        let content = create_test_content("This is a short text.");
        let config = ChunkConfig {
            target_size: 512,
            max_size: 1024,
            overlap: 64,
            ..Default::default()
        };

        let chunks = chunker.chunk(&content, &config).await.unwrap();

        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].content, "This is a short text.");
        assert_eq!(chunks[0].byte_range.start, 0);
        assert_eq!(chunks[0].depth, 0);
    }

    #[tokio::test]
    async fn test_chunk_long_text() {
        let chunker = FixedSizeChunker::new();
        let text = "A".repeat(3000);
        let content = create_test_content(&text);
        let config = ChunkConfig {
            target_size: 256,
            max_size: 512,
            overlap: 32,
            ..Default::default()
        };

        let chunks = chunker.chunk(&content, &config).await.unwrap();

        assert!(chunks.len() > 1, "Should create multiple chunks");
        // Borrow each chunk body instead of cloning just to measure length.
        let total_content: String = chunks.iter().map(|c| c.content.as_str()).collect();
        assert!(
            total_content.len() >= text.len(),
            "Chunks should cover all content (with possible overlap)"
        );
    }

    #[tokio::test]
    async fn test_chunk_with_overlap() {
        let chunker = FixedSizeChunker::new();
        let text = "Word ".repeat(200);
        let content = create_test_content(&text);
        let config = ChunkConfig {
            target_size: 100,
            max_size: 200,
            overlap: 25,
            ..Default::default()
        };

        let chunks = chunker.chunk(&content, &config).await.unwrap();

        // 1000 chars at 400 chars/chunk with a 100-char overlap: the tail of
        // each chunk must equal the head of the next. (Previously this test
        // only asserted the slices were non-empty.)
        assert!(chunks.len() >= 2);
        let overlap_chars = 25 * 4;
        let first = &chunks[0].content;
        let second = &chunks[1].content;
        assert_eq!(
            &first[first.len() - overlap_chars..],
            &second[..overlap_chars],
            "Consecutive chunks should share the configured overlap"
        );
    }

    #[tokio::test]
    async fn test_chunk_respects_paragraph_breaks() {
        let chunker = FixedSizeChunker::new();
        let text = format!(
            "{}\n\n{}",
            "First paragraph. ".repeat(50),
            "Second paragraph. ".repeat(50)
        );
        let content = create_test_content(&text);
        let config = ChunkConfig {
            target_size: 200,
            max_size: 400,
            overlap: 20,
            ..Default::default()
        };

        let chunks = chunker.chunk(&content, &config).await.unwrap();

        assert!(!chunks.is_empty());
        // The paragraph break (char 850) falls inside the first chunk's
        // break-point window (chars 640..880 for an 800-char target), so at
        // least one chunk must end on a newline boundary. (Previously this
        // value was computed but never asserted.)
        let has_clean_break = chunks
            .iter()
            .any(|c| c.content.ends_with("\n\n") || c.content.ends_with('\n'));
        assert!(
            has_clean_break,
            "Expected a chunk to end at the paragraph break"
        );
    }

    #[tokio::test]
    async fn test_chunk_line_ranges() {
        let chunker = FixedSizeChunker::new();
        let text = "Line 1\nLine 2\nLine 3\nLine 4\nLine 5";
        let content = create_test_content(text);
        let config = ChunkConfig {
            target_size: 512,
            max_size: 1024,
            overlap: 0,
            ..Default::default()
        };

        let chunks = chunker.chunk(&content, &config).await.unwrap();

        assert_eq!(chunks.len(), 1);
        assert!(chunks[0].line_range.is_some());
        let line_range = chunks[0].line_range.as_ref().unwrap();
        assert_eq!(line_range.start, 0);
        assert_eq!(line_range.end, 4);
    }

    #[tokio::test]
    async fn test_chunk_byte_ranges() {
        let chunker = FixedSizeChunker::new();
        let text = "Hello, world!";
        let content = create_test_content(text);
        let config = ChunkConfig::default();

        let chunks = chunker.chunk(&content, &config).await.unwrap();

        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].byte_range.start, 0);
        assert_eq!(chunks[0].byte_range.end, text.len() as u64);
    }

    #[tokio::test]
    async fn test_chunk_unicode_text() {
        let chunker = FixedSizeChunker::new();
        let text = "Hello 世界! 🌍 Привет мир! مرحبا";
        let content = create_test_content(text);
        let config = ChunkConfig::default();

        let chunks = chunker.chunk(&content, &config).await.unwrap();

        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].content, text);
        // byte_range.end is a byte offset, so multi-byte chars must count
        // by their UTF-8 width, not by chars.
        assert_eq!(chunks[0].byte_range.end, text.len() as u64);
    }

    #[test]
    fn test_chunker_name() {
        let chunker = FixedSizeChunker::new();
        assert_eq!(chunker.name(), "fixed_size");
    }

    #[test]
    fn test_chunker_content_types() {
        let chunker = FixedSizeChunker::new();
        let types = chunker.content_types();
        assert!(types.contains(&"text"));
        assert!(types.contains(&"code"));
        assert!(types.contains(&"markdown"));
    }

    #[test]
    fn test_can_chunk_any_type() {
        let chunker = FixedSizeChunker::new();

        assert!(chunker.can_chunk(&ContentType::Text));
        assert!(chunker.can_chunk(&ContentType::Markdown));
        assert!(chunker.can_chunk(&ContentType::Code {
            language: "rust".to_string(),
            symbol: None,
        }));
    }

    #[test]
    fn test_find_break_point_at_end() {
        let chars: Vec<char> = "Hello world".chars().collect();
        let result = find_break_point(&chars, 0, 20, chars.len());
        assert_eq!(result, chars.len());
    }

    #[test]
    fn test_find_break_point_at_newline() {
        let chars: Vec<char> = "Hello\nworld".chars().collect();
        let result = find_break_point(&chars, 0, 6, chars.len());
        assert_eq!(result, 6);
    }

    #[test]
    fn test_find_break_point_at_paragraph() {
        let chars: Vec<char> = "Hello\n\nworld".chars().collect();
        let result = find_break_point(&chars, 0, 7, chars.len());
        assert_eq!(result, 7);
    }

    #[test]
    fn test_default_implementation() {
        let chunker = FixedSizeChunker;
        assert_eq!(chunker.name(), "fixed_size");
    }
}