ragfs_extract/
text.rs

1//! Text content extractor.
2
3use async_trait::async_trait;
4use ragfs_core::{
5    ContentElement, ContentExtractor, ContentMetadataInfo, ExtractError, ExtractedContent,
6};
7use std::path::Path;
8use tokio::fs;
9
/// Extractor for plain text files.
///
/// This is a stateless unit type, so it is free to copy and can be printed
/// for diagnostics.
#[derive(Debug, Clone, Copy)]
pub struct TextExtractor;
12
13impl TextExtractor {
14    /// Create a new text extractor.
15    #[must_use]
16    pub fn new() -> Self {
17        Self
18    }
19}
20
21impl Default for TextExtractor {
22    fn default() -> Self {
23        Self::new()
24    }
25}
26
27#[async_trait]
28impl ContentExtractor for TextExtractor {
29    fn supported_types(&self) -> &[&str] {
30        &[
31            "text/plain",
32            "text/markdown",
33            "text/x-markdown",
34            "application/x-sh",
35            "text/x-rust",
36            "text/x-python",
37            "text/x-java",
38            "text/javascript",
39            "text/typescript",
40            "text/x-go",
41            "text/x-c",
42            "text/x-c++",
43            "application/json",
44            "application/xml",
45            "text/xml",
46            "text/html",
47            "text/css",
48            "application/toml",
49            "text/x-toml",
50            "application/yaml",
51            "text/x-yaml",
52        ]
53    }
54
55    fn can_extract_by_extension(&self, path: &Path) -> bool {
56        let extensions = [
57            "txt",
58            "md",
59            "markdown",
60            "rs",
61            "py",
62            "java",
63            "js",
64            "ts",
65            "tsx",
66            "jsx",
67            "go",
68            "c",
69            "cpp",
70            "cc",
71            "h",
72            "hpp",
73            "json",
74            "xml",
75            "html",
76            "htm",
77            "css",
78            "scss",
79            "sass",
80            "toml",
81            "yaml",
82            "yml",
83            "sh",
84            "bash",
85            "zsh",
86            "fish",
87            "sql",
88            "rb",
89            "php",
90            "swift",
91            "kt",
92            "kts",
93            "scala",
94            "clj",
95            "ex",
96            "exs",
97            "erl",
98            "hs",
99            "ml",
100            "mli",
101            "fs",
102            "fsx",
103            "lua",
104            "vim",
105            "el",
106            "lisp",
107            "scm",
108            "rkt",
109            "asm",
110            "s",
111            "dockerfile",
112            "makefile",
113            "cmake",
114            "gradle",
115            "sbt",
116            "cabal",
117            "nix",
118            "tf",
119            "hcl",
120        ];
121
122        path.extension()
123            .and_then(|ext| ext.to_str())
124            .is_some_and(|ext| extensions.contains(&ext.to_lowercase().as_str()))
125    }
126
127    async fn extract(&self, path: &Path) -> Result<ExtractedContent, ExtractError> {
128        let content = fs::read_to_string(path).await?;
129
130        // Detect language from extension
131        let language = path
132            .extension()
133            .and_then(|ext| ext.to_str())
134            .map(str::to_lowercase);
135
136        // Build elements (simple paragraph-based for now)
137        let elements = content
138            .split("\n\n")
139            .enumerate()
140            .filter(|(_, para)| !para.trim().is_empty())
141            .map(|(_idx, para)| ContentElement::Paragraph {
142                text: para.to_string(),
143                byte_offset: content[..content.find(para).unwrap_or(0)].len() as u64,
144            })
145            .collect();
146
147        Ok(ExtractedContent {
148            text: content,
149            elements,
150            images: vec![],
151            metadata: ContentMetadataInfo {
152                language,
153                ..Default::default()
154            },
155        })
156    }
157}
158
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::tempdir;

    /// Asks a fresh extractor whether it accepts `name` based on its extension.
    fn accepts(name: &str) -> bool {
        TextExtractor::new().can_extract_by_extension(Path::new(name))
    }

    /// Writes `body` to `file_name` inside a fresh temporary directory and
    /// runs the extractor on that file.
    async fn extract_from(file_name: &str, body: &str) -> Result<ExtractedContent, ExtractError> {
        let dir = tempdir().unwrap();
        let target = dir.path().join(file_name);
        std::fs::write(&target, body).unwrap();

        TextExtractor::new().extract(&target).await
    }

    // --- construction and supported MIME types ---

    #[test]
    fn test_new_extractor() {
        assert!(!TextExtractor::new().supported_types().is_empty());
    }

    #[test]
    fn test_default_implementation() {
        assert!(!TextExtractor.supported_types().is_empty());
    }

    #[test]
    fn test_supported_types_includes_common_types() {
        let extractor = TextExtractor::new();
        let types = extractor.supported_types();

        for expected in [
            "text/plain",
            "text/markdown",
            "text/x-rust",
            "text/x-python",
            "application/json",
            "text/javascript",
        ] {
            assert!(types.contains(&expected));
        }
    }

    // --- extension-based detection ---

    #[test]
    fn test_can_extract_by_extension_txt() {
        assert!(accepts("/test/file.txt"));
    }

    #[test]
    fn test_can_extract_by_extension_rust() {
        assert!(accepts("/test/main.rs"));
    }

    #[test]
    fn test_can_extract_by_extension_python() {
        assert!(accepts("/test/script.py"));
    }

    #[test]
    fn test_can_extract_by_extension_markdown() {
        assert!(accepts("/test/README.md"));
    }

    #[test]
    fn test_can_extract_by_extension_json() {
        assert!(accepts("/test/config.json"));
    }

    #[test]
    fn test_can_extract_by_extension_typescript() {
        assert!(accepts("/test/app.tsx"));
    }

    #[test]
    fn test_cannot_extract_binary() {
        assert!(!accepts("/test/image.png"));
    }

    #[test]
    fn test_cannot_extract_executable() {
        assert!(!accepts("/test/program.exe"));
    }

    #[test]
    fn test_cannot_extract_no_extension() {
        assert!(!accepts("/test/file_without_extension"));
    }

    #[test]
    fn test_can_extract_case_insensitive() {
        assert!(accepts("/test/FILE.TXT"));
    }

    // --- extraction ---

    #[tokio::test]
    async fn test_extract_simple_text() {
        let result = extract_from("test.txt", "Hello, world!").await;

        assert!(result.is_ok());
        let content = result.unwrap();
        assert_eq!(content.text, "Hello, world!");
        assert!(content.images.is_empty());
    }

    #[tokio::test]
    async fn test_extract_detects_language() {
        let content = extract_from("main.rs", "fn main() {}").await.unwrap();

        assert_eq!(content.metadata.language.as_deref(), Some("rs"));
    }

    #[tokio::test]
    async fn test_extract_python_language() {
        let content = extract_from("script.py", "print('hello')").await.unwrap();

        assert_eq!(content.metadata.language.as_deref(), Some("py"));
    }

    #[tokio::test]
    async fn test_extract_creates_paragraph_elements() {
        let text = "First paragraph.\n\nSecond paragraph.\n\nThird paragraph.";
        let content = extract_from("test.txt", text).await.unwrap();

        assert_eq!(content.elements.len(), 3);
    }

    #[tokio::test]
    async fn test_extract_handles_empty_file() {
        let content = extract_from("empty.txt", "").await.unwrap();

        assert_eq!(content.text, "");
        assert!(content.elements.is_empty());
    }

    #[tokio::test]
    async fn test_extract_handles_unicode() {
        let text = "Hello 世界! 🌍 Привет мир!";
        let content = extract_from("unicode.txt", text).await.unwrap();

        assert_eq!(content.text, text);
    }

    #[tokio::test]
    async fn test_extract_nonexistent_file_fails() {
        let missing = Path::new("/nonexistent/file.txt");
        let outcome = TextExtractor::new().extract(missing).await;

        assert!(outcome.is_err());
    }

    #[tokio::test]
    async fn test_extract_multiline_content() {
        let text = "Line 1\nLine 2\nLine 3";
        let content = extract_from("multi.txt", text).await.unwrap();

        assert_eq!(content.text, text);
        // No blank line in the input, so it is a single paragraph.
        assert_eq!(content.elements.len(), 1);
    }

    // --- extension families ---

    #[test]
    fn test_can_extract_config_files() {
        for name in ["config.toml", "config.yaml", "config.yml"] {
            assert!(accepts(name));
        }
    }

    #[test]
    fn test_can_extract_shell_scripts() {
        for name in ["script.sh", "script.bash", "script.zsh"] {
            assert!(accepts(name));
        }
    }

    #[test]
    fn test_can_extract_web_files() {
        for name in ["index.html", "styles.css", "app.js", "app.ts"] {
            assert!(accepts(name));
        }
    }
}
387}