1use async_trait::async_trait;
7use ragfs_core::{
8 ChunkConfig, ChunkError, ChunkOutput, ChunkOutputMetadata, Chunker, ContentElement,
9 ContentType, ExtractedContent,
10};
11use tracing::debug;
12
/// Chunker that splits plain text and markdown along semantic
/// (heading/section) boundaries rather than at fixed byte offsets.
pub struct SemanticChunker;
15
16impl SemanticChunker {
17 #[must_use]
19 pub fn new() -> Self {
20 Self
21 }
22}
23
24impl Default for SemanticChunker {
25 fn default() -> Self {
26 Self::new()
27 }
28}
29
30#[async_trait]
31impl Chunker for SemanticChunker {
32 fn name(&self) -> &'static str {
33 "semantic"
34 }
35
36 fn content_types(&self) -> &[&str] {
37 &["text", "markdown"]
38 }
39
40 fn can_chunk(&self, content_type: &ContentType) -> bool {
41 matches!(content_type, ContentType::Text | ContentType::Markdown)
42 }
43
44 async fn chunk(
45 &self,
46 content: &ExtractedContent,
47 config: &ChunkConfig,
48 ) -> Result<Vec<ChunkOutput>, ChunkError> {
49 let text = &content.text;
50 if text.is_empty() {
51 return Ok(vec![]);
52 }
53
54 debug!("Semantic chunking {} bytes", text.len());
55
56 if !content.elements.is_empty() {
58 return chunk_from_elements(text, &content.elements, config);
59 }
60
61 let sections = parse_sections(text);
63 chunk_sections(text, §ions, config)
64 }
65}
66
/// A contiguous region of the source text delimited by headings.
#[derive(Debug)]
struct Section {
    /// Heading text without `#` markers; `None` for preamble before the
    /// first heading (or plain text with no headings at all).
    heading: Option<String>,
    /// Heading level (1 for `#` / `===`, 2 for `##` / `---`, etc.);
    /// 0 when there is no heading.
    heading_level: u8,
    /// Byte offset of the section start in the original text.
    start_byte: usize,
    /// Byte offset of the section end (exclusive) in the original text.
    end_byte: usize,
    /// Accumulated body lines joined with `\n` (heading line not included).
    content: String,
}
76
/// Splits `text` into heading-delimited [`Section`]s.
///
/// Recognizes ATX headings (`# Title`) and Setext underline headings
/// (`Title` followed by `===` or `---`). Text before the first heading
/// becomes a heading-less section. Sections with neither a heading nor
/// non-whitespace content are dropped.
fn parse_sections(text: &str) -> Vec<Section> {
    let mut sections = Vec::new();
    let mut current_section = Section {
        heading: None,
        heading_level: 0,
        start_byte: 0,
        end_byte: 0,
        content: String::new(),
    };

    let lines: Vec<&str> = text.lines().collect();
    // Byte offset of the start of the current line. Advanced by
    // `line.len() + 1` below, which assumes a single `\n` terminator per
    // line — NOTE(review): CRLF input would skew these offsets; confirm
    // inputs are normalized upstream.
    let mut byte_offset = 0;

    for (i, line) in lines.iter().enumerate() {
        if let Some((level, heading_text)) = parse_markdown_heading(line) {
            // Close out the section accumulated so far before starting the
            // new heading's section.
            if !current_section.content.trim().is_empty() || current_section.heading.is_some() {
                current_section.end_byte = byte_offset;
                sections.push(current_section);
            }

            current_section = Section {
                heading: Some(heading_text),
                heading_level: level,
                start_byte: byte_offset,
                end_byte: 0,
                content: String::new(),
            };
        }
        else if i > 0 && is_underline_heading(line, &lines[..i]) {
            // Setext heading: the *previous* line is the heading text and
            // this line is the `===`/`---` underline.
            let prev_line = lines[i - 1];
            let level = if line.starts_with('=') { 1 } else { 2 };

            if !sections.is_empty() {
                // NOTE(review): this strips the heading text off the most
                // recently *pushed* section, but the previous line would
                // normally have been appended to `current_section` instead —
                // confirm this branch fires with the content it expects.
                let last = sections.last_mut().unwrap();
                if last.content.ends_with(prev_line) {
                    last.content = last.content[..last.content.len() - prev_line.len()]
                        .trim_end()
                        .to_string();
                    last.end_byte = byte_offset - prev_line.len() - 1;
                }
            } else if current_section.content.ends_with(prev_line) {
                // Remove the heading text from the in-progress section body;
                // it becomes the new section's heading instead.
                current_section.content = current_section.content
                    [..current_section.content.len() - prev_line.len()]
                    .trim_end()
                    .to_string();
            }

            if !current_section.content.trim().is_empty() || current_section.heading.is_some() {
                // End the previous section just before the heading-text line.
                current_section.end_byte = byte_offset - prev_line.len() - 1;
                sections.push(current_section);
            }

            // The new section starts at the heading-text line, not at the
            // underline. NOTE(review): `byte_offset - prev_line.len() - 1`
            // can underflow if the underline is the very first line with a
            // non-empty predecessor slice — guarded only by `i > 0`; verify.
            current_section = Section {
                heading: Some(prev_line.to_string()),
                heading_level: level,
                start_byte: byte_offset - prev_line.len() - 1,
                end_byte: 0,
                content: String::new(),
            };
        } else {
            // Ordinary content line: append to the current section body.
            if !current_section.content.is_empty() {
                current_section.content.push('\n');
            }
            current_section.content.push_str(line);
        }

        // Advance past this line plus its assumed `\n` terminator.
        byte_offset += line.len() + 1;
    }

    // Flush the trailing section; it runs to the end of the text.
    current_section.end_byte = text.len();
    if !current_section.content.trim().is_empty() || current_section.heading.is_some() {
        sections.push(current_section);
    }

    sections
}
164
/// Parses an ATX markdown heading (`#` through `######` followed by
/// whitespace), returning the heading level and the trimmed heading text.
///
/// Returns `None` for non-headings, for more than six hashes, and for
/// hashes not followed by whitespace (e.g. `#NoSpace`).
fn parse_markdown_heading(line: &str) -> Option<(u8, String)> {
    let trimmed = line.trim_start();

    // Count the leading run of '#' characters (single-byte, so byte-wise
    // counting and slicing below are safe).
    let hash_count = trimmed.bytes().take_while(|b| *b == b'#').count();
    if hash_count == 0 || hash_count > 6 {
        return None;
    }

    // A real heading requires whitespace between the hashes and the text.
    let rest = &trimmed[hash_count..];
    let first = rest.chars().next()?;
    if !first.is_whitespace() {
        return None;
    }

    Some((hash_count as u8, rest.trim().to_string()))
}
184
/// Returns `true` when `line` is a Setext heading underline: at least three
/// uniform `=` or `-` characters (after trimming), preceded by a non-empty
/// line that is not itself an ATX heading.
fn is_underline_heading(line: &str, previous_lines: &[&str]) -> bool {
    let marker = line.trim();

    // Too short to be an underline (also rejects the empty string).
    if marker.len() < 3 {
        return false;
    }

    // The underline must be entirely '=' or entirely '-'.
    let uniform = match marker.as_bytes()[0] {
        b'=' => marker.bytes().all(|b| b == b'='),
        b'-' => marker.bytes().all(|b| b == b'-'),
        _ => return false,
    };
    if !uniform {
        return false;
    }

    // The line above supplies the heading text: it must exist, be
    // non-empty, and not already be an ATX heading.
    match previous_lines.last() {
        Some(prev) => {
            let prev = prev.trim();
            !prev.is_empty() && !prev.starts_with('#')
        }
        None => false,
    }
}
213
214fn chunk_sections(
216 text: &str,
217 sections: &[Section],
218 config: &ChunkConfig,
219) -> Result<Vec<ChunkOutput>, ChunkError> {
220 let mut chunks = Vec::new();
221 let chars_per_token = 4;
222 let target_chars = config.target_size * chars_per_token;
223 let max_chars = config.max_size * chars_per_token;
224
225 let mut current_chunk = String::new();
226 let mut chunk_start = 0;
227 let mut current_heading: Option<String> = None;
228
229 for section in sections {
230 let section_text = if let Some(ref heading) = section.heading {
231 format!(
232 "{} {}\n\n{}",
233 "#".repeat(section.heading_level as usize),
234 heading,
235 section.content.trim()
236 )
237 } else {
238 section.content.trim().to_string()
239 };
240
241 if !current_chunk.is_empty() && current_chunk.len() + section_text.len() > max_chars {
243 chunks.push(create_chunk(
244 text,
245 ¤t_chunk,
246 chunk_start,
247 current_heading.take(),
248 ));
249 current_chunk = String::new();
250 chunk_start = section.start_byte;
251 }
252
253 if section_text.len() > max_chars {
255 if !current_chunk.is_empty() {
257 chunks.push(create_chunk(
258 text,
259 ¤t_chunk,
260 chunk_start,
261 current_heading.take(),
262 ));
263 current_chunk = String::new();
264 }
265
266 let sub_chunks = split_large_section(§ion_text, section.start_byte, config)?;
268 for mut sub_chunk in sub_chunks {
269 if chunks.is_empty() || sub_chunk.metadata.symbol_name.is_none() {
271 sub_chunk.metadata.symbol_name = section.heading.clone();
272 }
273 chunks.push(sub_chunk);
274 }
275 chunk_start = section.end_byte;
276 } else {
277 if !current_chunk.is_empty() {
279 current_chunk.push_str("\n\n");
280 }
281 current_chunk.push_str(§ion_text);
282
283 if section.heading.is_some() {
285 current_heading = section.heading.clone();
286 }
287
288 if current_chunk.len() >= target_chars {
290 chunks.push(create_chunk(
291 text,
292 ¤t_chunk,
293 chunk_start,
294 current_heading.take(),
295 ));
296 current_chunk = String::new();
297 chunk_start = section.end_byte;
298 }
299 }
300 }
301
302 if !current_chunk.is_empty() {
304 chunks.push(create_chunk(
305 text,
306 ¤t_chunk,
307 chunk_start,
308 current_heading,
309 ));
310 }
311
312 Ok(chunks)
313}
314
315fn create_chunk(
317 _text: &str,
318 content: &str,
319 start_byte: usize,
320 heading: Option<String>,
321) -> ChunkOutput {
322 let line_count = content.matches('\n').count() as u32;
323
324 ChunkOutput {
325 content: content.to_string(),
326 byte_range: start_byte as u64..(start_byte + content.len()) as u64,
327 line_range: Some(0..line_count),
328 parent_index: None,
329 depth: 0,
330 metadata: ChunkOutputMetadata {
331 symbol_type: heading.as_ref().map(|_| "section".to_string()),
332 symbol_name: heading,
333 language: None,
334 },
335 }
336}
337
338fn split_large_section(
340 text: &str,
341 base_offset: usize,
342 config: &ChunkConfig,
343) -> Result<Vec<ChunkOutput>, ChunkError> {
344 let mut chunks = Vec::new();
345 let chars_per_token = 4;
346 let target_chars = config.target_size * chars_per_token;
347 let overlap_chars = config.overlap * chars_per_token;
348
349 let paragraphs: Vec<&str> = text.split("\n\n").collect();
350 let mut current = String::new();
351 let mut current_offset = base_offset;
352
353 for para in paragraphs {
354 let para = para.trim();
355 if para.is_empty() {
356 continue;
357 }
358
359 if !current.is_empty() && current.len() + para.len() > target_chars {
361 let line_count = current.matches('\n').count() as u32;
362 chunks.push(ChunkOutput {
363 content: current.clone(),
364 byte_range: current_offset as u64..(current_offset + current.len()) as u64,
365 line_range: Some(0..line_count),
366 parent_index: None,
367 depth: 0,
368 metadata: ChunkOutputMetadata::default(),
369 });
370
371 let overlap_start = current.len().saturating_sub(overlap_chars);
373 let overlap = ¤t[overlap_start..];
374 current = format!("{overlap}\n\n{para}");
375 current_offset += overlap_start;
376 } else {
377 if !current.is_empty() {
378 current.push_str("\n\n");
379 }
380 current.push_str(para);
381 }
382 }
383
384 if !current.is_empty() {
386 let line_count = current.matches('\n').count() as u32;
387 chunks.push(ChunkOutput {
388 content: current.clone(),
389 byte_range: current_offset as u64..(current_offset + current.len()) as u64,
390 line_range: Some(0..line_count),
391 parent_index: None,
392 depth: 0,
393 metadata: ChunkOutputMetadata::default(),
394 });
395 }
396
397 Ok(chunks)
398}
399
400fn chunk_from_elements(
402 text: &str,
403 elements: &[ContentElement],
404 config: &ChunkConfig,
405) -> Result<Vec<ChunkOutput>, ChunkError> {
406 let mut chunks = Vec::new();
407 let chars_per_token = 4;
408 let target_chars = config.target_size * chars_per_token;
409 let max_chars = config.max_size * chars_per_token;
410
411 let mut current_chunk = String::new();
412 let mut chunk_start = 0u64;
413 let mut current_heading: Option<String> = None;
414
415 for element in elements {
416 let (elem_text, elem_offset, is_heading) = match element {
417 ContentElement::Heading {
418 level,
419 text,
420 byte_offset,
421 } => {
422 let heading_text = format!("{} {}", "#".repeat(*level as usize), text);
423 (heading_text, *byte_offset, true)
424 }
425 ContentElement::Paragraph { text, byte_offset } => (text.clone(), *byte_offset, false),
426 ContentElement::CodeBlock {
427 language,
428 code,
429 byte_offset,
430 } => {
431 let lang = language.as_deref().unwrap_or("");
432 let block = format!("```{lang}\n{code}\n```");
433 (block, *byte_offset, false)
434 }
435 ContentElement::List {
436 items,
437 ordered,
438 byte_offset,
439 } => {
440 let list_text = items
441 .iter()
442 .enumerate()
443 .map(|(i, item)| {
444 if *ordered {
445 format!("{}. {}", i + 1, item)
446 } else {
447 format!("- {item}")
448 }
449 })
450 .collect::<Vec<_>>()
451 .join("\n");
452 (list_text, *byte_offset, false)
453 }
454 ContentElement::Table {
455 headers,
456 rows,
457 byte_offset,
458 } => {
459 let mut table = String::new();
460 table.push_str(&format!("| {} |\n", headers.join(" | ")));
461 table.push_str(&format!(
462 "| {} |\n",
463 headers
464 .iter()
465 .map(|_| "---")
466 .collect::<Vec<_>>()
467 .join(" | ")
468 ));
469 for row in rows {
470 table.push_str(&format!("| {} |\n", row.join(" | ")));
471 }
472 (table, *byte_offset, false)
473 }
474 };
475
476 if !current_chunk.is_empty() && current_chunk.len() + elem_text.len() + 2 > max_chars {
478 chunks.push(create_chunk(
479 text,
480 ¤t_chunk,
481 chunk_start as usize,
482 current_heading.take(),
483 ));
484 current_chunk = String::new();
485 chunk_start = elem_offset;
486 }
487
488 if !current_chunk.is_empty() {
490 current_chunk.push_str("\n\n");
491 }
492 current_chunk.push_str(&elem_text);
493
494 if is_heading {
495 current_heading = Some(elem_text.trim_start_matches('#').trim().to_string());
496 }
497
498 if current_chunk.len() >= target_chars {
500 chunks.push(create_chunk(
501 text,
502 ¤t_chunk,
503 chunk_start as usize,
504 current_heading.take(),
505 ));
506 current_chunk = String::new();
507 chunk_start = elem_offset + elem_text.len() as u64;
508 }
509 }
510
511 if !current_chunk.is_empty() {
513 chunks.push(create_chunk(
514 text,
515 ¤t_chunk,
516 chunk_start as usize,
517 current_heading,
518 ));
519 }
520
521 Ok(chunks)
522}
523
#[cfg(test)]
mod tests {
    use super::*;
    use ragfs_core::ContentMetadataInfo;

    /// Builds an `ExtractedContent` holding only raw text (no structured
    /// elements), exercising the text-parsing fallback path.
    fn create_test_content(text: &str) -> ExtractedContent {
        ExtractedContent {
            text: text.to_string(),
            elements: vec![],
            images: vec![],
            metadata: ContentMetadataInfo::default(),
        }
    }

    /// Builds an `ExtractedContent` carrying extractor-style structured
    /// elements, exercising the element-driven chunking path.
    fn create_content_with_elements(text: &str, elements: Vec<ContentElement>) -> ExtractedContent {
        ExtractedContent {
            text: text.to_string(),
            elements,
            images: vec![],
            metadata: ContentMetadataInfo::default(),
        }
    }

    // ATX headings parse to (level, trimmed text); non-headings and
    // hash-without-space forms are rejected.
    #[test]
    fn test_parse_markdown_heading() {
        assert_eq!(
            parse_markdown_heading("# Title"),
            Some((1, "Title".to_string()))
        );
        assert_eq!(
            parse_markdown_heading("## Subtitle"),
            Some((2, "Subtitle".to_string()))
        );
        assert_eq!(
            parse_markdown_heading("### Section"),
            Some((3, "Section".to_string()))
        );
        assert_eq!(parse_markdown_heading("Not a heading"), None);
        assert_eq!(parse_markdown_heading("#NoSpace"), None);
    }

    // Setext underlines require a non-empty previous line.
    #[test]
    fn test_is_underline_heading() {
        assert!(is_underline_heading("===", &["Title"]));
        assert!(is_underline_heading("---", &["Subtitle"]));
        assert!(!is_underline_heading("===", &[""]));
        assert!(!is_underline_heading("---", &[]));
    }

    // Two ATX headings produce two sections with the expected headings.
    #[test]
    fn test_parse_sections() {
        let text = "# Introduction\n\nThis is the intro.\n\n## Details\n\nMore details here.";
        let sections = parse_sections(text);

        assert_eq!(sections.len(), 2);
        assert_eq!(sections[0].heading, Some("Introduction".to_string()));
        assert_eq!(sections[1].heading, Some("Details".to_string()));
    }

    // The chunker accepts text/markdown but not code content.
    #[test]
    fn test_semantic_chunker_can_chunk() {
        let chunker = SemanticChunker::new();
        assert!(chunker.can_chunk(&ContentType::Text));
        assert!(chunker.can_chunk(&ContentType::Markdown));
        assert!(!chunker.can_chunk(&ContentType::Code {
            language: "rust".to_string(),
            symbol: None
        }));
    }

    #[test]
    fn test_chunker_name() {
        let chunker = SemanticChunker::new();
        assert_eq!(chunker.name(), "semantic");
    }

    #[test]
    fn test_chunker_content_types() {
        let chunker = SemanticChunker::new();
        let types = chunker.content_types();
        assert!(types.contains(&"text"));
        assert!(types.contains(&"markdown"));
    }

    // The unit-struct literal behaves the same as `new()`.
    #[test]
    fn test_default_implementation() {
        let chunker = SemanticChunker;
        assert_eq!(chunker.name(), "semantic");
    }

    // Empty input yields no chunks.
    #[tokio::test]
    async fn test_chunk_empty_text() {
        let chunker = SemanticChunker::new();
        let content = create_test_content("");
        let config = ChunkConfig::default();

        let chunks = chunker.chunk(&content, &config).await.unwrap();
        assert!(chunks.is_empty());
    }

    // Short heading-less text becomes a single chunk.
    #[tokio::test]
    async fn test_chunk_simple_text() {
        let chunker = SemanticChunker::new();
        let content = create_test_content("This is simple text without headings.");
        let config = ChunkConfig::default();

        let chunks = chunker.chunk(&content, &config).await.unwrap();
        assert_eq!(chunks.len(), 1);
        assert!(chunks[0].content.contains("simple text"));
    }

    // All heading text survives chunking of a markdown document.
    #[tokio::test]
    async fn test_chunk_markdown_with_headings() {
        let chunker = SemanticChunker::new();
        let text = "# Introduction\n\nThis is the intro.\n\n## Details\n\nMore details here.";
        let content = create_test_content(text);
        let config = ChunkConfig::default();

        let chunks = chunker.chunk(&content, &config).await.unwrap();
        assert!(!chunks.is_empty());
        let all_content: String = chunks.iter().map(|c| c.content.clone()).collect();
        assert!(all_content.contains("Introduction"));
        assert!(all_content.contains("Details"));
    }

    // Structured elements take the element-driven path and still chunk.
    #[tokio::test]
    async fn test_chunk_with_structured_elements() {
        let chunker = SemanticChunker::new();
        let text = "# Title\n\nParagraph content.";
        let elements = vec![
            ContentElement::Heading {
                level: 1,
                text: "Title".to_string(),
                byte_offset: 0,
            },
            ContentElement::Paragraph {
                text: "Paragraph content.".to_string(),
                byte_offset: 9,
            },
        ];
        let content = create_content_with_elements(text, elements);
        let config = ChunkConfig::default();

        let chunks = chunker.chunk(&content, &config).await.unwrap();
        assert!(!chunks.is_empty());
    }

    // Text far beyond max_size must be split into multiple chunks.
    #[tokio::test]
    async fn test_chunk_large_text_splits() {
        let chunker = SemanticChunker::new();
        let section = "# Section\n\nThis is a paragraph with some content.\n\n";
        let text = section.repeat(50);
        let content = create_test_content(&text);
        let config = ChunkConfig {
            target_size: 100,
            max_size: 200,
            overlap: 20,
            ..Default::default()
        };

        let chunks = chunker.chunk(&content, &config).await.unwrap();
        assert!(
            chunks.len() > 1,
            "Large text should produce multiple chunks"
        );
    }

    // The section heading is surfaced as the chunk's symbol name (when set).
    #[tokio::test]
    async fn test_chunk_preserves_heading_metadata() {
        let chunker = SemanticChunker::new();
        let text = "# Important Section\n\nContent under the section.";
        let content = create_test_content(text);
        let config = ChunkConfig::default();

        let chunks = chunker.chunk(&content, &config).await.unwrap();
        assert!(!chunks.is_empty());
        if let Some(symbol_name) = &chunks[0].metadata.symbol_name {
            assert!(symbol_name.contains("Important Section"));
        }
    }

    // Code-block elements are re-fenced and their code is preserved.
    #[tokio::test]
    async fn test_chunk_with_code_block_element() {
        let chunker = SemanticChunker::new();
        let text = "# Code Example\n\n```rust\nfn main() {}\n```";
        let elements = vec![
            ContentElement::Heading {
                level: 1,
                text: "Code Example".to_string(),
                byte_offset: 0,
            },
            ContentElement::CodeBlock {
                language: Some("rust".to_string()),
                code: "fn main() {}".to_string(),
                byte_offset: 16,
            },
        ];
        let content = create_content_with_elements(text, elements);
        let config = ChunkConfig::default();

        let chunks = chunker.chunk(&content, &config).await.unwrap();
        assert!(!chunks.is_empty());
        let all_content: String = chunks.iter().map(|c| c.content.clone()).collect();
        assert!(all_content.contains("fn main()"));
    }

    // List elements are rendered back to bullet items.
    #[tokio::test]
    async fn test_chunk_with_list_element() {
        let chunker = SemanticChunker::new();
        let text = "# Items\n\n- Item 1\n- Item 2\n- Item 3";
        let elements = vec![
            ContentElement::Heading {
                level: 1,
                text: "Items".to_string(),
                byte_offset: 0,
            },
            ContentElement::List {
                items: vec![
                    "Item 1".to_string(),
                    "Item 2".to_string(),
                    "Item 3".to_string(),
                ],
                ordered: false,
                byte_offset: 9,
            },
        ];
        let content = create_content_with_elements(text, elements);
        let config = ChunkConfig::default();

        let chunks = chunker.chunk(&content, &config).await.unwrap();
        assert!(!chunks.is_empty());
        let all_content: String = chunks.iter().map(|c| c.content.clone()).collect();
        assert!(all_content.contains("Item 1"));
        assert!(all_content.contains("Item 2"));
    }

    // Table elements chunk without error.
    #[tokio::test]
    async fn test_chunk_with_table_element() {
        let chunker = SemanticChunker::new();
        let text = "# Data\n\n| A | B |\n|---|---|\n| 1 | 2 |";
        let elements = vec![
            ContentElement::Heading {
                level: 1,
                text: "Data".to_string(),
                byte_offset: 0,
            },
            ContentElement::Table {
                headers: vec!["A".to_string(), "B".to_string()],
                rows: vec![vec!["1".to_string(), "2".to_string()]],
                byte_offset: 8,
            },
        ];
        let content = create_content_with_elements(text, elements);
        let config = ChunkConfig::default();

        let chunks = chunker.chunk(&content, &config).await.unwrap();
        assert!(!chunks.is_empty());
    }

    // Setext (underline) headings also produce distinct sections.
    #[test]
    fn test_parse_sections_with_underline_headings() {
        let text = "Title\n=====\n\nContent under title.\n\nSubtitle\n--------\n\nMore content.";
        let sections = parse_sections(text);

        assert!(sections.len() >= 2);
    }

    // Heading-less text still yields at least one (preamble) section.
    #[test]
    fn test_parse_sections_plain_text() {
        let text = "Just some plain text\n\nwith multiple paragraphs\n\nbut no headings.";
        let sections = parse_sections(text);

        assert!(!sections.is_empty());
    }

    // Levels 1-6 parse; level 7 is rejected per the ATX rules.
    #[test]
    fn test_parse_markdown_heading_levels() {
        assert_eq!(parse_markdown_heading("# H1"), Some((1, "H1".to_string())));
        assert_eq!(parse_markdown_heading("## H2"), Some((2, "H2".to_string())));
        assert_eq!(
            parse_markdown_heading("### H3"),
            Some((3, "H3".to_string()))
        );
        assert_eq!(
            parse_markdown_heading("#### H4"),
            Some((4, "H4".to_string()))
        );
        assert_eq!(
            parse_markdown_heading("##### H5"),
            Some((5, "H5".to_string()))
        );
        assert_eq!(
            parse_markdown_heading("###### H6"),
            Some((6, "H6".to_string()))
        );
        assert_eq!(parse_markdown_heading("####### H7"), None);
    }
}