// ragfs_chunker/fixed.rs

1//! Fixed-size chunking strategy with overlap.
2
3use async_trait::async_trait;
4use ragfs_core::{
5    ChunkConfig, ChunkError, ChunkOutput, ChunkOutputMetadata, Chunker, ContentType,
6    ExtractedContent,
7};
8
/// Fixed-size chunker with configurable overlap.
#[derive(Default)]
pub struct FixedSizeChunker;

impl FixedSizeChunker {
    /// Create a new fixed-size chunker.
    #[must_use]
    pub fn new() -> Self {
        Self::default()
    }
}
25
26#[async_trait]
27impl Chunker for FixedSizeChunker {
28    fn name(&self) -> &'static str {
29        "fixed_size"
30    }
31
32    fn content_types(&self) -> &[&str] {
33        &["text", "code", "markdown"]
34    }
35
36    fn can_chunk(&self, _content_type: &ContentType) -> bool {
37        // Can handle any content type as fallback
38        true
39    }
40
41    async fn chunk(
42        &self,
43        content: &ExtractedContent,
44        config: &ChunkConfig,
45    ) -> Result<Vec<ChunkOutput>, ChunkError> {
46        let text = &content.text;
47        if text.is_empty() {
48            return Ok(vec![]);
49        }
50
51        let mut chunks = Vec::new();
52        let chars: Vec<char> = text.chars().collect();
53        let total_chars = chars.len();
54
55        // Approximate chars per token (rough estimate)
56        let chars_per_token = 4;
57        let target_chars = config.target_size * chars_per_token;
58        let overlap_chars = config.overlap * chars_per_token;
59        let step = target_chars.saturating_sub(overlap_chars).max(1);
60
61        let mut start = 0;
62        while start < total_chars {
63            let end = (start + target_chars).min(total_chars);
64
65            // Try to find a good break point (newline or sentence end)
66            let actual_end = find_break_point(&chars, start, end, total_chars);
67
68            let chunk_text: String = chars[start..actual_end].iter().collect();
69            let byte_start = text.char_indices().nth(start).map_or(0, |(i, _)| i) as u64;
70            let byte_end = text
71                .char_indices()
72                .nth(actual_end)
73                .map_or(text.len(), |(i, _)| i) as u64;
74
75            // Count lines
76            let line_start = text[..byte_start as usize].matches('\n').count() as u32;
77            let line_end = line_start + chunk_text.matches('\n').count() as u32;
78
79            chunks.push(ChunkOutput {
80                content: chunk_text,
81                byte_range: byte_start..byte_end,
82                line_range: Some(line_start..line_end),
83                parent_index: None,
84                depth: 0,
85                metadata: ChunkOutputMetadata {
86                    language: content.metadata.language.clone(),
87                    ..Default::default()
88                },
89            });
90
91            start += step;
92            if actual_end >= total_chars {
93                break;
94            }
95        }
96
97        Ok(chunks)
98    }
99}
100
/// Find a good break point near the target end position.
///
/// Scans a window around `target_end` (up to 20% of the chunk span before it,
/// 10% after) and returns the index just past the best boundary found,
/// preferring, in order: paragraph break, single newline, sentence end.
/// Falls back to `target_end` when no boundary exists in the window.
fn find_break_point(chars: &[char], start: usize, target_end: usize, total: usize) -> usize {
    if target_end >= total {
        return total;
    }

    let span = target_end - start;
    let lo = target_end.saturating_sub(span / 5);
    let hi = (target_end + span / 10).min(total);

    // 1) Paragraph break ("\n\n"): resume right after the blank line.
    if let Some(i) = (lo..hi)
        .rev()
        .find(|&i| i + 1 < total && chars[i] == '\n' && chars[i + 1] == '\n')
    {
        return i + 2;
    }

    // 2) Single newline.
    if let Some(i) = (lo..hi).rev().find(|&i| chars[i] == '\n') {
        return i + 1;
    }

    // 3) Sentence terminator followed by whitespace.
    if let Some(i) = (lo..hi).rev().find(|&i| {
        matches!(chars[i], '.' | '!' | '?') && i + 1 < total && chars[i + 1].is_whitespace()
    }) {
        return i + 1;
    }

    // 4) No natural boundary in the window: cut exactly at the target.
    target_end
}
138
#[cfg(test)]
mod tests {
    use super::*;
    use ragfs_core::ContentMetadataInfo;

    /// Wrap raw text in an `ExtractedContent` with default metadata.
    fn make_content(text: &str) -> ExtractedContent {
        ExtractedContent {
            text: text.to_owned(),
            elements: Vec::new(),
            images: Vec::new(),
            metadata: ContentMetadataInfo::default(),
        }
    }

    #[tokio::test]
    async fn test_chunk_empty_text() {
        let chunks = FixedSizeChunker::new()
            .chunk(&make_content(""), &ChunkConfig::default())
            .await
            .unwrap();

        assert!(chunks.is_empty());
    }

    #[tokio::test]
    async fn test_chunk_short_text() {
        let config = ChunkConfig {
            target_size: 512,
            max_size: 1024,
            overlap: 64,
            ..Default::default()
        };
        let chunks = FixedSizeChunker::new()
            .chunk(&make_content("This is a short text."), &config)
            .await
            .unwrap();

        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].content, "This is a short text.");
        assert_eq!(chunks[0].byte_range.start, 0);
        assert_eq!(chunks[0].depth, 0);
    }

    #[tokio::test]
    async fn test_chunk_long_text() {
        // ~750 tokens at the 4-chars/token estimate; must split.
        let text = "A".repeat(3000);
        let config = ChunkConfig {
            target_size: 256, // small on purpose, to force multiple chunks
            max_size: 512,
            overlap: 32,
            ..Default::default()
        };
        let chunks = FixedSizeChunker::new()
            .chunk(&make_content(&text), &config)
            .await
            .unwrap();

        assert!(chunks.len() > 1, "Should create multiple chunks");
        // Every input character must appear somewhere (overlap may duplicate).
        let joined: String = chunks.iter().map(|c| c.content.clone()).collect();
        assert!(
            joined.len() >= text.len(),
            "Chunks should cover all content (with possible overlap)"
        );
    }

    #[tokio::test]
    async fn test_chunk_with_overlap() {
        let text = "Word ".repeat(200); // long enough to be split
        let config = ChunkConfig {
            target_size: 100, // ~400 chars
            max_size: 200,
            overlap: 25, // ~100 chars of overlap
            ..Default::default()
        };
        let chunks = FixedSizeChunker::new()
            .chunk(&make_content(&text), &config)
            .await
            .unwrap();

        // Consecutive chunks should share material near their boundary, but
        // the break-point heuristic makes the exact overlap hard to pin down,
        // so only check that the boundary slices are non-empty.
        if chunks.len() >= 2 {
            let tail = &chunks[0].content[chunks[0].content.len().saturating_sub(50)..];
            let head = &chunks[1].content[..50.min(chunks[1].content.len())];
            assert!(!tail.is_empty());
            assert!(!head.is_empty());
        }
    }

    #[tokio::test]
    async fn test_chunk_respects_paragraph_breaks() {
        let text = format!(
            "{}\n\n{}",
            "First paragraph. ".repeat(50),
            "Second paragraph. ".repeat(50)
        );
        let config = ChunkConfig {
            target_size: 200,
            max_size: 400,
            overlap: 20,
            ..Default::default()
        };
        let chunks = FixedSizeChunker::new()
            .chunk(&make_content(&text), &config)
            .await
            .unwrap();

        assert!(!chunks.is_empty());
        // Whether a chunk lands exactly on a paragraph boundary depends on the
        // text length, so this stays informational rather than asserted.
        let _has_clean_break = chunks
            .iter()
            .any(|c| c.content.ends_with("\n\n") || c.content.ends_with('\n'));
        assert!(!chunks.is_empty());
    }

    #[tokio::test]
    async fn test_chunk_line_ranges() {
        let config = ChunkConfig {
            target_size: 512, // comfortably fits the whole text
            max_size: 1024,
            overlap: 0,
            ..Default::default()
        };
        let chunks = FixedSizeChunker::new()
            .chunk(&make_content("Line 1\nLine 2\nLine 3\nLine 4\nLine 5"), &config)
            .await
            .unwrap();

        assert_eq!(chunks.len(), 1);
        assert!(chunks[0].line_range.is_some());
        let range = chunks[0].line_range.as_ref().unwrap();
        assert_eq!(range.start, 0);
        // Four newlines in the text => lines 0 through 4.
        assert_eq!(range.end, 4);
    }

    #[tokio::test]
    async fn test_chunk_byte_ranges() {
        let text = "Hello, world!";
        let chunks = FixedSizeChunker::new()
            .chunk(&make_content(text), &ChunkConfig::default())
            .await
            .unwrap();

        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].byte_range.start, 0);
        assert_eq!(chunks[0].byte_range.end, text.len() as u64);
    }

    #[tokio::test]
    async fn test_chunk_unicode_text() {
        let text = "Hello 世界! 🌍 Привет мир! مرحبا";
        let chunks = FixedSizeChunker::new()
            .chunk(&make_content(text), &ChunkConfig::default())
            .await
            .unwrap();

        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].content, text);
        // The byte range must reflect UTF-8 length, not char count.
        assert!(chunks[0].byte_range.end as usize == text.len());
    }

    #[test]
    fn test_chunker_name() {
        assert_eq!(FixedSizeChunker::new().name(), "fixed_size");
    }

    #[test]
    fn test_chunker_content_types() {
        let types = FixedSizeChunker::new().content_types();
        for expected in ["text", "code", "markdown"] {
            assert!(types.contains(&expected));
        }
    }

    #[test]
    fn test_can_chunk_any_type() {
        let chunker = FixedSizeChunker::new();

        assert!(chunker.can_chunk(&ContentType::Text));
        assert!(chunker.can_chunk(&ContentType::Markdown));
        assert!(chunker.can_chunk(&ContentType::Code {
            language: "rust".to_string(),
            symbol: None,
        }));
    }

    #[test]
    fn test_find_break_point_at_end() {
        let chars: Vec<char> = "Hello world".chars().collect();
        // Target beyond the input: clamp to the full length.
        assert_eq!(find_break_point(&chars, 0, 20, chars.len()), chars.len());
    }

    #[test]
    fn test_find_break_point_at_newline() {
        let chars: Vec<char> = "Hello\nworld".chars().collect();
        // Newline sits at index 5; the cut resumes right after it.
        assert_eq!(find_break_point(&chars, 0, 6, chars.len()), 6);
    }

    #[test]
    fn test_find_break_point_at_paragraph() {
        let chars: Vec<char> = "Hello\n\nworld".chars().collect();
        // The blank-line (paragraph) break is preferred over a plain newline.
        assert_eq!(find_break_point(&chars, 0, 7, chars.len()), 7);
    }

    #[test]
    fn test_default_implementation() {
        assert_eq!(FixedSizeChunker::default().name(), "fixed_size");
    }
}