1use async_trait::async_trait;
4use ragfs_core::{
5 ContentElement, ContentExtractor, ContentMetadataInfo, ExtractError, ExtractedContent,
6};
7use std::path::Path;
8use tokio::fs;
9
/// Content extractor for plain-text and source-code files.
///
/// The type is a stateless unit struct, so values are free to create, copy
/// and share. `TextExtractor::new()`, `TextExtractor::default()` and the bare
/// `TextExtractor` value are all equivalent.
#[derive(Debug, Clone, Copy, Default)]
pub struct TextExtractor;

impl TextExtractor {
    /// Creates a new text extractor.
    ///
    /// This is a `const fn`, so it can also be used to initialise constants
    /// and statics.
    #[must_use]
    pub const fn new() -> Self {
        Self
    }
}
26
27#[async_trait]
28impl ContentExtractor for TextExtractor {
29 fn supported_types(&self) -> &[&str] {
30 &[
31 "text/plain",
32 "text/markdown",
33 "text/x-markdown",
34 "application/x-sh",
35 "text/x-rust",
36 "text/x-python",
37 "text/x-java",
38 "text/javascript",
39 "text/typescript",
40 "text/x-go",
41 "text/x-c",
42 "text/x-c++",
43 "application/json",
44 "application/xml",
45 "text/xml",
46 "text/html",
47 "text/css",
48 "application/toml",
49 "text/x-toml",
50 "application/yaml",
51 "text/x-yaml",
52 ]
53 }
54
55 fn can_extract_by_extension(&self, path: &Path) -> bool {
56 let extensions = [
57 "txt",
58 "md",
59 "markdown",
60 "rs",
61 "py",
62 "java",
63 "js",
64 "ts",
65 "tsx",
66 "jsx",
67 "go",
68 "c",
69 "cpp",
70 "cc",
71 "h",
72 "hpp",
73 "json",
74 "xml",
75 "html",
76 "htm",
77 "css",
78 "scss",
79 "sass",
80 "toml",
81 "yaml",
82 "yml",
83 "sh",
84 "bash",
85 "zsh",
86 "fish",
87 "sql",
88 "rb",
89 "php",
90 "swift",
91 "kt",
92 "kts",
93 "scala",
94 "clj",
95 "ex",
96 "exs",
97 "erl",
98 "hs",
99 "ml",
100 "mli",
101 "fs",
102 "fsx",
103 "lua",
104 "vim",
105 "el",
106 "lisp",
107 "scm",
108 "rkt",
109 "asm",
110 "s",
111 "dockerfile",
112 "makefile",
113 "cmake",
114 "gradle",
115 "sbt",
116 "cabal",
117 "nix",
118 "tf",
119 "hcl",
120 ];
121
122 path.extension()
123 .and_then(|ext| ext.to_str())
124 .is_some_and(|ext| extensions.contains(&ext.to_lowercase().as_str()))
125 }
126
127 async fn extract(&self, path: &Path) -> Result<ExtractedContent, ExtractError> {
128 let content = fs::read_to_string(path).await?;
129
130 let language = path
132 .extension()
133 .and_then(|ext| ext.to_str())
134 .map(str::to_lowercase);
135
136 let elements = content
138 .split("\n\n")
139 .enumerate()
140 .filter(|(_, para)| !para.trim().is_empty())
141 .map(|(_idx, para)| ContentElement::Paragraph {
142 text: para.to_string(),
143 byte_offset: content[..content.find(para).unwrap_or(0)].len() as u64,
144 })
145 .collect();
146
147 Ok(ExtractedContent {
148 text: content,
149 elements,
150 images: vec![],
151 metadata: ContentMetadataInfo {
152 language,
153 ..Default::default()
154 },
155 })
156 }
157}
158
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::tempdir;

    /// Asks a fresh extractor whether it accepts `path` judging by its extension.
    fn accepts(path: &str) -> bool {
        TextExtractor::new().can_extract_by_extension(Path::new(path))
    }

    /// Writes `body` to a file called `name` inside a fresh temporary
    /// directory and runs the extractor on it.
    ///
    /// The directory stays alive for the whole extraction and is removed when
    /// this helper returns.
    async fn extract_fixture(name: &str, body: &str) -> Result<ExtractedContent, ExtractError> {
        let dir = tempdir().unwrap();
        let file = dir.path().join(name);
        std::fs::write(&file, body).unwrap();

        let extractor = TextExtractor::new();
        extractor.extract(&file).await
    }

    // --- construction ------------------------------------------------------

    #[test]
    fn test_new_extractor() {
        let extractor = TextExtractor::new();
        assert_ne!(extractor.supported_types().len(), 0);
    }

    #[test]
    fn test_default_implementation() {
        let extractor = TextExtractor;
        assert!(extractor.supported_types().first().is_some());
    }

    // --- MIME types --------------------------------------------------------

    #[test]
    fn test_supported_types_includes_common_types() {
        let extractor = TextExtractor::new();
        let declared = extractor.supported_types();

        for expected in [
            "text/plain",
            "text/markdown",
            "text/x-rust",
            "text/x-python",
            "application/json",
            "text/javascript",
        ] {
            assert!(declared.contains(&expected));
        }
    }

    // --- extension matching: accepted --------------------------------------

    #[test]
    fn test_can_extract_by_extension_txt() {
        assert!(accepts("/test/file.txt"));
    }

    #[test]
    fn test_can_extract_by_extension_rust() {
        assert!(accepts("/test/main.rs"));
    }

    #[test]
    fn test_can_extract_by_extension_python() {
        assert!(accepts("/test/script.py"));
    }

    #[test]
    fn test_can_extract_by_extension_markdown() {
        assert!(accepts("/test/README.md"));
    }

    #[test]
    fn test_can_extract_by_extension_json() {
        assert!(accepts("/test/config.json"));
    }

    #[test]
    fn test_can_extract_by_extension_typescript() {
        assert!(accepts("/test/app.tsx"));
    }

    // --- extension matching: rejected --------------------------------------

    #[test]
    fn test_cannot_extract_binary() {
        assert!(!accepts("/test/image.png"));
    }

    #[test]
    fn test_cannot_extract_executable() {
        assert!(!accepts("/test/program.exe"));
    }

    #[test]
    fn test_cannot_extract_no_extension() {
        assert!(!accepts("/test/file_without_extension"));
    }

    #[test]
    fn test_can_extract_case_insensitive() {
        assert!(accepts("/test/FILE.TXT"));
    }

    // --- extraction --------------------------------------------------------

    #[tokio::test]
    async fn test_extract_simple_text() {
        let outcome = extract_fixture("test.txt", "Hello, world!").await;
        assert!(outcome.is_ok());

        let extracted = outcome.unwrap();
        assert_eq!(extracted.text, "Hello, world!");
        assert!(extracted.images.is_empty());
    }

    #[tokio::test]
    async fn test_extract_detects_language() {
        let extracted = extract_fixture("main.rs", "fn main() {}").await.unwrap();

        assert_eq!(extracted.metadata.language.as_deref(), Some("rs"));
    }

    #[tokio::test]
    async fn test_extract_python_language() {
        let extracted = extract_fixture("script.py", "print('hello')")
            .await
            .unwrap();

        assert_eq!(extracted.metadata.language.as_deref(), Some("py"));
    }

    #[tokio::test]
    async fn test_extract_creates_paragraph_elements() {
        let body = "First paragraph.\n\nSecond paragraph.\n\nThird paragraph.";
        let extracted = extract_fixture("test.txt", body).await.unwrap();

        assert_eq!(extracted.elements.len(), 3);
    }

    #[tokio::test]
    async fn test_extract_handles_empty_file() {
        let extracted = extract_fixture("empty.txt", "").await.unwrap();

        assert!(extracted.text.is_empty());
        assert!(extracted.elements.is_empty());
    }

    #[tokio::test]
    async fn test_extract_handles_unicode() {
        let body = "Hello 世界! 🌍 Привет мир!";
        let extracted = extract_fixture("unicode.txt", body).await.unwrap();

        assert_eq!(extracted.text, body);
    }

    #[tokio::test]
    async fn test_extract_nonexistent_file_fails() {
        let extractor = TextExtractor::new();
        let missing = Path::new("/nonexistent/file.txt");

        let outcome = extractor.extract(missing).await;
        assert!(outcome.is_err());
    }

    #[tokio::test]
    async fn test_extract_multiline_content() {
        // Single newlines do not separate paragraphs, so this is one element.
        let body = "Line 1\nLine 2\nLine 3";
        let extracted = extract_fixture("multi.txt", body).await.unwrap();

        assert_eq!(extracted.text, body);
        assert_eq!(extracted.elements.len(), 1);
    }

    // --- extension matching: grouped by file family -------------------------

    #[test]
    fn test_can_extract_config_files() {
        for name in ["config.toml", "config.yaml", "config.yml"] {
            assert!(accepts(name));
        }
    }

    #[test]
    fn test_can_extract_shell_scripts() {
        for name in ["script.sh", "script.bash", "script.zsh"] {
            assert!(accepts(name));
        }
    }

    #[test]
    fn test_can_extract_web_files() {
        for name in ["index.html", "styles.css", "app.js", "app.ts"] {
            assert!(accepts(name));
        }
    }
}