1use async_trait::async_trait;
7use ragfs_core::{
8 ChunkConfig, ChunkError, ChunkOutput, ChunkOutputMetadata, Chunker, ContentElement,
9 ContentType, ExtractedContent,
10};
11use tracing::debug;
12
/// Chunker that splits plain text and markdown along semantic
/// (heading/section) boundaries rather than at fixed byte offsets.
pub struct SemanticChunker;
15
16impl SemanticChunker {
17 #[must_use]
19 pub fn new() -> Self {
20 Self
21 }
22}
23
24impl Default for SemanticChunker {
25 fn default() -> Self {
26 Self::new()
27 }
28}
29
30#[async_trait]
31impl Chunker for SemanticChunker {
32 fn name(&self) -> &'static str {
33 "semantic"
34 }
35
36 fn content_types(&self) -> &[&str] {
37 &["text", "markdown"]
38 }
39
40 fn can_chunk(&self, content_type: &ContentType) -> bool {
41 matches!(content_type, ContentType::Text | ContentType::Markdown)
42 }
43
44 async fn chunk(
45 &self,
46 content: &ExtractedContent,
47 config: &ChunkConfig,
48 ) -> Result<Vec<ChunkOutput>, ChunkError> {
49 let text = &content.text;
50 if text.is_empty() {
51 return Ok(vec![]);
52 }
53
54 debug!("Semantic chunking {} bytes", text.len());
55
56 if !content.elements.is_empty() {
58 return chunk_from_elements(text, &content.elements, config);
59 }
60
61 let sections = parse_sections(text);
63 chunk_sections(text, §ions, config)
64 }
65}
66
/// A contiguous region of the source text delimited by headings.
#[derive(Debug)]
struct Section {
    /// Heading text without `#` markers; `None` for preamble before the
    /// first heading (or plain text with no headings at all).
    heading: Option<String>,
    /// Heading level (1 for `#` / `===`, 2 for `##` / `---`, etc.);
    /// 0 when there is no heading.
    heading_level: u8,
    /// Byte offset of the section start in the original text.
    start_byte: usize,
    /// Byte offset of the section end (exclusive) in the original text.
    end_byte: usize,
    /// Accumulated body lines joined with `\n` (heading line not included).
    content: String,
}
76
/// Splits `text` into heading-delimited [`Section`]s.
///
/// Recognizes ATX headings (`# Title`) and Setext underline headings
/// (`Title` followed by `===` or `---`). Text before the first heading
/// becomes a heading-less section. Sections with neither a heading nor
/// non-whitespace content are dropped.
fn parse_sections(text: &str) -> Vec<Section> {
    let mut sections = Vec::new();
    let mut current_section = Section {
        heading: None,
        heading_level: 0,
        start_byte: 0,
        end_byte: 0,
        content: String::new(),
    };

    let lines: Vec<&str> = text.lines().collect();
    // Byte offset of the start of the current line. Advanced by
    // `line.len() + 1` below, which assumes a single `\n` terminator per
    // line — NOTE(review): CRLF input would skew these offsets; confirm
    // inputs are normalized upstream.
    let mut byte_offset = 0;

    for (i, line) in lines.iter().enumerate() {
        if let Some((level, heading_text)) = parse_markdown_heading(line) {
            // Close out the section accumulated so far before starting the
            // new heading's section.
            if !current_section.content.trim().is_empty() || current_section.heading.is_some() {
                current_section.end_byte = byte_offset;
                sections.push(current_section);
            }

            current_section = Section {
                heading: Some(heading_text),
                heading_level: level,
                start_byte: byte_offset,
                end_byte: 0,
                content: String::new(),
            };
        }
        else if i > 0 && is_underline_heading(line, &lines[..i]) {
            // Setext heading: the *previous* line is the heading text and
            // this line is the `===`/`---` underline.
            let prev_line = lines[i - 1];
            let level = if line.starts_with('=') { 1 } else { 2 };

            if !sections.is_empty() {
                // NOTE(review): this strips the heading text off the most
                // recently *pushed* section, but the previous line would
                // normally have been appended to `current_section` instead —
                // confirm this branch fires with the content it expects.
                let last = sections.last_mut().unwrap();
                if last.content.ends_with(prev_line) {
                    last.content = last.content[..last.content.len() - prev_line.len()]
                        .trim_end()
                        .to_string();
                    last.end_byte = byte_offset - prev_line.len() - 1;
                }
            } else if current_section.content.ends_with(prev_line) {
                // Remove the heading text from the in-progress section body;
                // it becomes the new section's heading instead.
                current_section.content = current_section.content
                    [..current_section.content.len() - prev_line.len()]
                    .trim_end()
                    .to_string();
            }

            if !current_section.content.trim().is_empty() || current_section.heading.is_some() {
                // End the previous section just before the heading-text line.
                current_section.end_byte = byte_offset - prev_line.len() - 1;
                sections.push(current_section);
            }

            // The new section starts at the heading-text line, not at the
            // underline. NOTE(review): `byte_offset - prev_line.len() - 1`
            // can underflow if the underline is the very first line with a
            // non-empty predecessor slice — guarded only by `i > 0`; verify.
            current_section = Section {
                heading: Some(prev_line.to_string()),
                heading_level: level,
                start_byte: byte_offset - prev_line.len() - 1,
                end_byte: 0,
                content: String::new(),
            };
        } else {
            // Ordinary content line: append to the current section body.
            if !current_section.content.is_empty() {
                current_section.content.push('\n');
            }
            current_section.content.push_str(line);
        }

        // Advance past this line plus its assumed `\n` terminator.
        byte_offset += line.len() + 1;
    }

    // Flush the trailing section; it runs to the end of the text.
    current_section.end_byte = text.len();
    if !current_section.content.trim().is_empty() || current_section.heading.is_some() {
        sections.push(current_section);
    }

    sections
}
164
/// Parses an ATX markdown heading (`#` through `######` followed by
/// whitespace), returning the heading level and the trimmed heading text.
///
/// Returns `None` for non-headings, for more than six hashes, and for
/// hashes not followed by whitespace (e.g. `#NoSpace`).
fn parse_markdown_heading(line: &str) -> Option<(u8, String)> {
    let trimmed = line.trim_start();

    // Count the leading run of '#' characters (single-byte, so byte-wise
    // counting and slicing below are safe).
    let hash_count = trimmed.bytes().take_while(|b| *b == b'#').count();
    if hash_count == 0 || hash_count > 6 {
        return None;
    }

    // A real heading requires whitespace between the hashes and the text.
    let rest = &trimmed[hash_count..];
    let first = rest.chars().next()?;
    if !first.is_whitespace() {
        return None;
    }

    Some((hash_count as u8, rest.trim().to_string()))
}
184
/// Returns `true` when `line` is a Setext heading underline: at least three
/// uniform `=` or `-` characters (after trimming), preceded by a non-empty
/// line that is not itself an ATX heading.
fn is_underline_heading(line: &str, previous_lines: &[&str]) -> bool {
    let marker = line.trim();

    // Too short to be an underline (also rejects the empty string).
    if marker.len() < 3 {
        return false;
    }

    // The underline must be entirely '=' or entirely '-'.
    let uniform = match marker.as_bytes()[0] {
        b'=' => marker.bytes().all(|b| b == b'='),
        b'-' => marker.bytes().all(|b| b == b'-'),
        _ => return false,
    };
    if !uniform {
        return false;
    }

    // The line above supplies the heading text: it must exist, be
    // non-empty, and not already be an ATX heading.
    match previous_lines.last() {
        Some(prev) => {
            let prev = prev.trim();
            !prev.is_empty() && !prev.starts_with('#')
        }
        None => false,
    }
}
213
214fn chunk_sections(
216 text: &str,
217 sections: &[Section],
218 config: &ChunkConfig,
219) -> Result<Vec<ChunkOutput>, ChunkError> {
220 let mut chunks = Vec::new();
221 let chars_per_token = 4;
222 let target_chars = config.target_size * chars_per_token;
223 let max_chars = config.max_size * chars_per_token;
224
225 let mut current_chunk = String::new();
226 let mut chunk_start = 0;
227 let mut current_heading: Option<String> = None;
228
229 for section in sections {
230 let section_text = if let Some(ref heading) = section.heading {
231 format!(
232 "{} {}\n\n{}",
233 "#".repeat(section.heading_level as usize),
234 heading,
235 section.content.trim()
236 )
237 } else {
238 section.content.trim().to_string()
239 };
240
241 if !current_chunk.is_empty() && current_chunk.len() + section_text.len() > max_chars {
243 chunks.push(create_chunk(
244 text,
245 ¤t_chunk,
246 chunk_start,
247 current_heading.take(),
248 ));
249 current_chunk = String::new();
250 chunk_start = section.start_byte;
251 }
252
253 if section_text.len() > max_chars {
255 if !current_chunk.is_empty() {
257 chunks.push(create_chunk(
258 text,
259 ¤t_chunk,
260 chunk_start,
261 current_heading.take(),
262 ));
263 current_chunk = String::new();
264 }
265
266 let sub_chunks = split_large_section(§ion_text, section.start_byte, config)?;
268 for mut sub_chunk in sub_chunks {
269 if chunks.is_empty() || sub_chunk.metadata.symbol_name.is_none() {
271 sub_chunk.metadata.symbol_name = section.heading.clone();
272 }
273 chunks.push(sub_chunk);
274 }
275 chunk_start = section.end_byte;
276 } else {
277 if !current_chunk.is_empty() {
279 current_chunk.push_str("\n\n");
280 }
281 current_chunk.push_str(§ion_text);
282
283 if section.heading.is_some() {
285 current_heading = section.heading.clone();
286 }
287
288 if current_chunk.len() >= target_chars {
290 chunks.push(create_chunk(
291 text,
292 ¤t_chunk,
293 chunk_start,
294 current_heading.take(),
295 ));
296 current_chunk = String::new();
297 chunk_start = section.end_byte;
298 }
299 }
300 }
301
302 if !current_chunk.is_empty() {
304 chunks.push(create_chunk(
305 text,
306 ¤t_chunk,
307 chunk_start,
308 current_heading,
309 ));
310 }
311
312 Ok(chunks)
313}
314
315fn create_chunk(
317 _text: &str,
318 content: &str,
319 start_byte: usize,
320 heading: Option<String>,
321) -> ChunkOutput {
322 let line_count = content.matches('\n').count() as u32;
323
324 ChunkOutput {
325 content: content.to_string(),
326 byte_range: start_byte as u64..(start_byte + content.len()) as u64,
327 line_range: Some(0..line_count),
328 parent_index: None,
329 depth: 0,
330 metadata: ChunkOutputMetadata {
331 symbol_type: heading.as_ref().map(|_| "section".to_string()),
332 symbol_name: heading,
333 language: None,
334 },
335 }
336}
337
338fn split_large_section(
340 text: &str,
341 base_offset: usize,
342 config: &ChunkConfig,
343) -> Result<Vec<ChunkOutput>, ChunkError> {
344 let mut chunks = Vec::new();
345 let chars_per_token = 4;
346 let target_chars = config.target_size * chars_per_token;
347 let overlap_chars = config.overlap * chars_per_token;
348
349 let paragraphs: Vec<&str> = text.split("\n\n").collect();
350 let mut current = String::new();
351 let mut current_offset = base_offset;
352
353 for para in paragraphs {
354 let para = para.trim();
355 if para.is_empty() {
356 continue;
357 }
358
359 if !current.is_empty() && current.len() + para.len() > target_chars {
361 let line_count = current.matches('\n').count() as u32;
362 chunks.push(ChunkOutput {
363 content: current.clone(),
364 byte_range: current_offset as u64..(current_offset + current.len()) as u64,
365 line_range: Some(0..line_count),
366 parent_index: None,
367 depth: 0,
368 metadata: ChunkOutputMetadata::default(),
369 });
370
371 let overlap_start = current.len().saturating_sub(overlap_chars);
373 let overlap = ¤t[overlap_start..];
374 current = format!("{overlap}\n\n{para}");
375 current_offset += overlap_start;
376 } else {
377 if !current.is_empty() {
378 current.push_str("\n\n");
379 }
380 current.push_str(para);
381 }
382 }
383
384 if !current.is_empty() {
386 let line_count = current.matches('\n').count() as u32;
387 chunks.push(ChunkOutput {
388 content: current.clone(),
389 byte_range: current_offset as u64..(current_offset + current.len()) as u64,
390 line_range: Some(0..line_count),
391 parent_index: None,
392 depth: 0,
393 metadata: ChunkOutputMetadata::default(),
394 });
395 }
396
397 Ok(chunks)
398}
399
400fn chunk_from_elements(
402 text: &str,
403 elements: &[ContentElement],
404 config: &ChunkConfig,
405) -> Result<Vec<ChunkOutput>, ChunkError> {
406 let mut chunks = Vec::new();
407 let chars_per_token = 4;
408 let target_chars = config.target_size * chars_per_token;
409 let max_chars = config.max_size * chars_per_token;
410
411 let mut current_chunk = String::new();
412 let mut chunk_start = 0u64;
413 let mut current_heading: Option<String> = None;
414
415 for element in elements {
416 let (elem_text, elem_offset, is_heading) = match element {
417 ContentElement::Heading {
418 level,
419 text,
420 byte_offset,
421 } => {
422 let heading_text = format!("{} {}", "#".repeat(*level as usize), text);
423 (heading_text, *byte_offset, true)
424 }
425 ContentElement::Paragraph { text, byte_offset } => (text.clone(), *byte_offset, false),
426 ContentElement::CodeBlock {
427 language,
428 code,
429 byte_offset,
430 } => {
431 let lang = language.as_deref().unwrap_or("");
432 let block = format!("```{lang}\n{code}\n```");
433 (block, *byte_offset, false)
434 }
435 ContentElement::List {
436 items,
437 ordered,
438 byte_offset,
439 } => {
440 let list_text = items
441 .iter()
442 .enumerate()
443 .map(|(i, item)| {
444 if *ordered {
445 format!("{}. {}", i + 1, item)
446 } else {
447 format!("- {item}")
448 }
449 })
450 .collect::<Vec<_>>()
451 .join("\n");
452 (list_text, *byte_offset, false)
453 }
454 ContentElement::Table {
455 headers,
456 rows,
457 byte_offset,
458 } => {
459 let mut table = String::new();
460 table.push_str(&format!("| {} |\n", headers.join(" | ")));
461 table.push_str(&format!(
462 "| {} |\n",
463 headers
464 .iter()
465 .map(|_| "---")
466 .collect::<Vec<_>>()
467 .join(" | ")
468 ));
469 for row in rows {
470 table.push_str(&format!("| {} |\n", row.join(" | ")));
471 }
472 (table, *byte_offset, false)
473 }
474 };
475
476 if !current_chunk.is_empty() && current_chunk.len() + elem_text.len() + 2 > max_chars {
478 chunks.push(create_chunk(
479 text,
480 ¤t_chunk,
481 chunk_start as usize,
482 current_heading.take(),
483 ));
484 current_chunk = String::new();
485 chunk_start = elem_offset;
486 }
487
488 if !current_chunk.is_empty() {
490 current_chunk.push_str("\n\n");
491 }
492 current_chunk.push_str(&elem_text);
493
494 if is_heading {
495 current_heading = Some(elem_text.trim_start_matches('#').trim().to_string());
496 }
497
498 if current_chunk.len() >= target_chars {
500 chunks.push(create_chunk(
501 text,
502 ¤t_chunk,
503 chunk_start as usize,
504 current_heading.take(),
505 ));
506 current_chunk = String::new();
507 chunk_start = elem_offset + elem_text.len() as u64;
508 }
509 }
510
511 if !current_chunk.is_empty() {
513 chunks.push(create_chunk(
514 text,
515 ¤t_chunk,
516 chunk_start as usize,
517 current_heading,
518 ));
519 }
520
521 Ok(chunks)
522}
523
#[cfg(test)]
mod tests {
    use super::*;
    use ragfs_core::ContentMetadataInfo;

    /// Builds an `ExtractedContent` holding only raw text (no structured
    /// elements), exercising the text-parsing fallback path.
    fn create_test_content(text: &str) -> ExtractedContent {
        ExtractedContent {
            text: text.to_string(),
            elements: vec![],
            images: vec![],
            metadata: ContentMetadataInfo::default(),
        }
    }

    /// Builds an `ExtractedContent` carrying extractor-style structured
    /// elements, exercising the element-driven chunking path.
    fn create_content_with_elements(text: &str, elements: Vec<ContentElement>) -> ExtractedContent {
        ExtractedContent {
            text: text.to_string(),
            elements,
            images: vec![],
            metadata: ContentMetadataInfo::default(),
        }
    }

    // ATX headings parse to (level, trimmed text); non-headings and
    // hash-without-space forms are rejected.
    #[test]
    fn test_parse_markdown_heading() {
        assert_eq!(
            parse_markdown_heading("# Title"),
            Some((1, "Title".to_string()))
        );
        assert_eq!(
            parse_markdown_heading("## Subtitle"),
            Some((2, "Subtitle".to_string()))
        );
        assert_eq!(
            parse_markdown_heading("### Section"),
            Some((3, "Section".to_string()))
        );
        assert_eq!(parse_markdown_heading("Not a heading"), None);
        assert_eq!(parse_markdown_heading("#NoSpace"), None);
    }

    // Setext underlines require a non-empty previous line.
    #[test]
    fn test_is_underline_heading() {
        assert!(is_underline_heading("===", &["Title"]));
        assert!(is_underline_heading("---", &["Subtitle"]));
        assert!(!is_underline_heading("===", &[""]));
        assert!(!is_underline_heading("---", &[]));
    }

    // Two ATX headings produce two sections with the expected headings.
    #[test]
    fn test_parse_sections() {
        let text = "# Introduction\n\nThis is the intro.\n\n## Details\n\nMore details here.";
        let sections = parse_sections(text);

        assert_eq!(sections.len(), 2);
        assert_eq!(sections[0].heading, Some("Introduction".to_string()));
        assert_eq!(sections[1].heading, Some("Details".to_string()));
    }

    // The chunker accepts text/markdown but not code content.
    #[test]
    fn test_semantic_chunker_can_chunk() {
        let chunker = SemanticChunker::new();
        assert!(chunker.can_chunk(&ContentType::Text));
        assert!(chunker.can_chunk(&ContentType::Markdown));
        assert!(!chunker.can_chunk(&ContentType::Code {
            language: "rust".to_string(),
            symbol: None
        }));
    }

    #[test]
    fn test_chunker_name() {
        let chunker = SemanticChunker::new();
        assert_eq!(chunker.name(), "semantic");
    }

    #[test]
    fn test_chunker_content_types() {
        let chunker = SemanticChunker::new();
        let types = chunker.content_types();
        assert!(types.contains(&"text"));
        assert!(types.contains(&"markdown"));
    }

    // The unit-struct literal behaves the same as `new()`.
    #[test]
    fn test_default_implementation() {
        let chunker = SemanticChunker;
        assert_eq!(chunker.name(), "semantic");
    }

    // Empty input yields no chunks.
    #[tokio::test]
    async fn test_chunk_empty_text() {
        let chunker = SemanticChunker::new();
        let content = create_test_content("");
        let config = ChunkConfig::default();

        let chunks = chunker.chunk(&content, &config).await.unwrap();
        assert!(chunks.is_empty());
    }

    // Short heading-less text becomes a single chunk.
    #[tokio::test]
    async fn test_chunk_simple_text() {
        let chunker = SemanticChunker::new();
        let content = create_test_content("This is simple text without headings.");
        let config = ChunkConfig::default();

        let chunks = chunker.chunk(&content, &config).await.unwrap();
        assert_eq!(chunks.len(), 1);
        assert!(chunks[0].content.contains("simple text"));
    }

    // All heading text survives chunking of a markdown document.
    #[tokio::test]
    async fn test_chunk_markdown_with_headings() {
        let chunker = SemanticChunker::new();
        let text = "# Introduction\n\nThis is the intro.\n\n## Details\n\nMore details here.";
        let content = create_test_content(text);
        let config = ChunkConfig::default();

        let chunks = chunker.chunk(&content, &config).await.unwrap();
        assert!(!chunks.is_empty());
        let all_content: String = chunks.iter().map(|c| c.content.clone()).collect();
        assert!(all_content.contains("Introduction"));
        assert!(all_content.contains("Details"));
    }

    // Structured elements take the element-driven path and still chunk.
    #[tokio::test]
    async fn test_chunk_with_structured_elements() {
        let chunker = SemanticChunker::new();
        let text = "# Title\n\nParagraph content.";
        let elements = vec![
            ContentElement::Heading {
                level: 1,
                text: "Title".to_string(),
                byte_offset: 0,
            },
            ContentElement::Paragraph {
                text: "Paragraph content.".to_string(),
                byte_offset: 9,
            },
        ];
        let content = create_content_with_elements(text, elements);
        let config = ChunkConfig::default();

        let chunks = chunker.chunk(&content, &config).await.unwrap();
        assert!(!chunks.is_empty());
    }

    // Text far beyond max_size must be split into multiple chunks.
    #[tokio::test]
    async fn test_chunk_large_text_splits() {
        let chunker = SemanticChunker::new();
        let section = "# Section\n\nThis is a paragraph with some content.\n\n";
        let text = section.repeat(50);
        let content = create_test_content(&text);
        let config = ChunkConfig {
            target_size: 100,
            max_size: 200,
            overlap: 20,
            ..Default::default()
        };

        let chunks = chunker.chunk(&content, &config).await.unwrap();
        assert!(
            chunks.len() > 1,
            "Large text should produce multiple chunks"
        );
    }

    // The section heading is surfaced as the chunk's symbol name (when set).
    #[tokio::test]
    async fn test_chunk_preserves_heading_metadata() {
        let chunker = SemanticChunker::new();
        let text = "# Important Section\n\nContent under the section.";
        let content = create_test_content(text);
        let config = ChunkConfig::default();

        let chunks = chunker.chunk(&content, &config).await.unwrap();
        assert!(!chunks.is_empty());
        if let Some(symbol_name) = &chunks[0].metadata.symbol_name {
            assert!(symbol_name.contains("Important Section"));
        }
    }

    // Code-block elements are re-fenced and their code is preserved.
    #[tokio::test]
    async fn test_chunk_with_code_block_element() {
        let chunker = SemanticChunker::new();
        let text = "# Code Example\n\n```rust\nfn main() {}\n```";
        let elements = vec![
            ContentElement::Heading {
                level: 1,
                text: "Code Example".to_string(),
                byte_offset: 0,
            },
            ContentElement::CodeBlock {
                language: Some("rust".to_string()),
                code: "fn main() {}".to_string(),
                byte_offset: 16,
            },
        ];
        let content = create_content_with_elements(text, elements);
        let config = ChunkConfig::default();

        let chunks = chunker.chunk(&content, &config).await.unwrap();
        assert!(!chunks.is_empty());
        let all_content: String = chunks.iter().map(|c| c.content.clone()).collect();
        assert!(all_content.contains("fn main()"));
    }

    // List elements are rendered back to bullet items.
    #[tokio::test]
    async fn test_chunk_with_list_element() {
        let chunker = SemanticChunker::new();
        let text = "# Items\n\n- Item 1\n- Item 2\n- Item 3";
        let elements = vec![
            ContentElement::Heading {
                level: 1,
                text: "Items".to_string(),
                byte_offset: 0,
            },
            ContentElement::List {
                items: vec![
                    "Item 1".to_string(),
                    "Item 2".to_string(),
                    "Item 3".to_string(),
                ],
                ordered: false,
                byte_offset: 9,
            },
        ];
        let content = create_content_with_elements(text, elements);
        let config = ChunkConfig::default();

        let chunks = chunker.chunk(&content, &config).await.unwrap();
        assert!(!chunks.is_empty());
        let all_content: String = chunks.iter().map(|c| c.content.clone()).collect();
        assert!(all_content.contains("Item 1"));
        assert!(all_content.contains("Item 2"));
    }

    // Table elements chunk without error.
    #[tokio::test]
    async fn test_chunk_with_table_element() {
        let chunker = SemanticChunker::new();
        let text = "# Data\n\n| A | B |\n|---|---|\n| 1 | 2 |";
        let elements = vec![
            ContentElement::Heading {
                level: 1,
                text: "Data".to_string(),
                byte_offset: 0,
            },
            ContentElement::Table {
                headers: vec!["A".to_string(), "B".to_string()],
                rows: vec![vec!["1".to_string(), "2".to_string()]],
                byte_offset: 8,
            },
        ];
        let content = create_content_with_elements(text, elements);
        let config = ChunkConfig::default();

        let chunks = chunker.chunk(&content, &config).await.unwrap();
        assert!(!chunks.is_empty());
    }

    // Setext (underline) headings also produce distinct sections.
    #[test]
    fn test_parse_sections_with_underline_headings() {
        let text = "Title\n=====\n\nContent under title.\n\nSubtitle\n--------\n\nMore content.";
        let sections = parse_sections(text);

        assert!(sections.len() >= 2);
    }

    // Heading-less text still yields at least one (preamble) section.
    #[test]
    fn test_parse_sections_plain_text() {
        let text = "Just some plain text\n\nwith multiple paragraphs\n\nbut no headings.";
        let sections = parse_sections(text);

        assert!(!sections.is_empty());
    }

    // Levels 1-6 parse; level 7 is rejected per the ATX rules.
    #[test]
    fn test_parse_markdown_heading_levels() {
        assert_eq!(parse_markdown_heading("# H1"), Some((1, "H1".to_string())));
        assert_eq!(parse_markdown_heading("## H2"), Some((2, "H2".to_string())));
        assert_eq!(
            parse_markdown_heading("### H3"),
            Some((3, "H3".to_string()))
        );
        assert_eq!(
            parse_markdown_heading("#### H4"),
            Some((4, "H4".to_string()))
        );
        assert_eq!(
            parse_markdown_heading("##### H5"),
            Some((5, "H5".to_string()))
        );
        assert_eq!(
            parse_markdown_heading("###### H6"),
            Some((6, "H6".to_string()))
        );
        assert_eq!(parse_markdown_heading("####### H7"), None);
    }
}