1use async_trait::async_trait;
6use flate2::read::ZlibDecoder;
7use lopdf::Document;
8use ragfs_core::{
9 ContentElement, ContentExtractor, ContentMetadataInfo, ExtractError, ExtractedContent,
10 ExtractedImage,
11};
12use std::io::Read;
13use std::path::Path;
14use tracing::{debug, warn};
15
/// Extracts text, structural elements, and embedded images from PDF files.
pub struct PdfExtractor;
18
19impl PdfExtractor {
20 #[must_use]
22 pub fn new() -> Self {
23 Self
24 }
25}
26
27impl Default for PdfExtractor {
28 fn default() -> Self {
29 Self::new()
30 }
31}
32
33#[async_trait]
34impl ContentExtractor for PdfExtractor {
35 fn supported_types(&self) -> &[&str] {
36 &["application/pdf"]
37 }
38
39 fn can_extract_by_extension(&self, path: &Path) -> bool {
40 path.extension()
41 .and_then(|ext| ext.to_str())
42 .is_some_and(|ext| ext.eq_ignore_ascii_case("pdf"))
43 }
44
45 async fn extract(&self, path: &Path) -> Result<ExtractedContent, ExtractError> {
46 debug!("Extracting PDF: {:?}", path);
47
48 let bytes = tokio::fs::read(path).await?;
50
51 let text = tokio::task::spawn_blocking({
53 let bytes = bytes.clone();
54 move || extract_pdf_text(&bytes)
55 })
56 .await
57 .map_err(|e| ExtractError::Failed(format!("Task join error: {e}")))?
58 .map_err(|e| ExtractError::Failed(format!("PDF extraction failed: {e}")))?;
59
60 let images = tokio::task::spawn_blocking(move || extract_pdf_images(&bytes))
62 .await
63 .map_err(|e| ExtractError::Failed(format!("Image extraction task error: {e}")))?;
64
65 let elements = build_elements(&text);
67
68 let page_count = estimate_page_count(&text);
70
71 Ok(ExtractedContent {
72 text,
73 elements,
74 images,
75 metadata: ContentMetadataInfo {
76 page_count: Some(page_count),
77 ..Default::default()
78 },
79 })
80 }
81}
82
/// Extracts all text content from in-memory PDF bytes via `pdf_extract`.
///
/// The error is flattened to a plain `String` so callers can wrap it
/// uniformly in `ExtractError`.
fn extract_pdf_text(bytes: &[u8]) -> Result<String, String> {
    pdf_extract::extract_text_from_mem(bytes).map_err(|e| e.to_string())
}
87
88const MAX_IMAGES: usize = 100;
90const MAX_TOTAL_BYTES: usize = 50 * 1024 * 1024; const MIN_DIMENSION: u32 = 50; fn extract_pdf_images(bytes: &[u8]) -> Vec<ExtractedImage> {
95 let doc = match Document::load_mem(bytes) {
96 Ok(d) => d,
97 Err(e) => {
98 warn!("Failed to load PDF for image extraction: {}", e);
99 return vec![];
100 }
101 };
102
103 let mut images = Vec::new();
104 let mut total_bytes = 0usize;
105
106 let pages = doc.get_pages();
107 for (page_num, page_id) in pages {
108 if images.len() >= MAX_IMAGES {
109 debug!(
110 "Reached maximum image count ({}), stopping extraction",
111 MAX_IMAGES
112 );
113 break;
114 }
115
116 match doc.get_page_images(page_id) {
117 Ok(page_images) => {
118 for pdf_image in page_images {
119 if images.len() >= MAX_IMAGES || total_bytes >= MAX_TOTAL_BYTES {
120 break;
121 }
122
123 if pdf_image.width < i64::from(MIN_DIMENSION)
125 || pdf_image.height < i64::from(MIN_DIMENSION)
126 {
127 debug!(
128 "Skipping small image: {}x{}",
129 pdf_image.width, pdf_image.height
130 );
131 continue;
132 }
133
134 if let Some(extracted) = decode_pdf_image(&pdf_image, page_num) {
135 total_bytes += extracted.data.len();
136 images.push(extracted);
137 }
138 }
139 }
140 Err(e) => {
141 debug!("Failed to get images from page {}: {}", page_num, e);
142 }
143 }
144 }
145
146 debug!(
147 "Extracted {} images from PDF ({} bytes total)",
148 images.len(),
149 total_bytes
150 );
151 images
152}
153
154fn decode_pdf_image(pdf_image: &lopdf::xobject::PdfImage, page_num: u32) -> Option<ExtractedImage> {
156 let filters = pdf_image.filters.as_ref()?;
157
158 let (data, mime_type) = if filters.iter().any(|f| f == "DCTDecode") {
160 (pdf_image.content.to_vec(), "image/jpeg".to_string())
162 } else if filters.iter().any(|f| f == "FlateDecode") {
163 match decode_flate_image(pdf_image) {
165 Ok((data, mime)) => (data, mime),
166 Err(e) => {
167 debug!("Failed to decode FlateDecode image: {}", e);
168 return None;
169 }
170 }
171 } else if filters.iter().any(|f| f == "JPXDecode") {
172 (pdf_image.content.to_vec(), "image/jp2".to_string())
174 } else {
175 debug!("Unsupported image filter: {:?}", filters);
177 return None;
178 };
179
180 Some(ExtractedImage {
181 data,
182 mime_type,
183 caption: None, page: Some(page_num),
185 })
186}
187
188fn decode_flate_image(pdf_image: &lopdf::xobject::PdfImage) -> Result<(Vec<u8>, String), String> {
190 let mut decoder = ZlibDecoder::new(pdf_image.content);
192 let mut decompressed = Vec::new();
193 decoder
194 .read_to_end(&mut decompressed)
195 .map_err(|e| format!("Decompression failed: {e}"))?;
196
197 let color_space = pdf_image.color_space.as_deref().unwrap_or("DeviceRGB");
199 let width = pdf_image.width as u32;
200 let height = pdf_image.height as u32;
201
202 let img = match color_space {
203 "DeviceRGB" | "RGB" => image::RgbImage::from_raw(width, height, decompressed)
204 .map(image::DynamicImage::ImageRgb8),
205 "DeviceGray" | "Gray" => image::GrayImage::from_raw(width, height, decompressed)
206 .map(image::DynamicImage::ImageLuma8),
207 "DeviceCMYK" | "CMYK" => {
208 let rgb_data = cmyk_to_rgb(&decompressed);
210 image::RgbImage::from_raw(width, height, rgb_data).map(image::DynamicImage::ImageRgb8)
211 }
212 _ => {
213 debug!("Unknown color space '{}', attempting RGB", color_space);
215 image::RgbImage::from_raw(width, height, decompressed)
216 .map(image::DynamicImage::ImageRgb8)
217 }
218 };
219
220 let img = img.ok_or_else(|| "Failed to create image from raw data".to_string())?;
221
222 let mut png_data = Vec::new();
224 img.write_to(
225 &mut std::io::Cursor::new(&mut png_data),
226 image::ImageFormat::Png,
227 )
228 .map_err(|e| format!("PNG encoding failed: {e}"))?;
229
230 Ok((png_data, "image/png".to_string()))
231}
232
/// Converts raw 8-bit CMYK samples to interleaved 8-bit RGB.
///
/// Uses the naive conversion R = 255·(1−C)·(1−K) (and analogously for G, B).
/// Trailing bytes that do not form a complete 4-byte CMYK sample are ignored.
#[allow(
    clippy::many_single_char_names,
    clippy::cast_possible_truncation,
    clippy::cast_sign_loss
)]
fn cmyk_to_rgb(cmyk: &[u8]) -> Vec<u8> {
    cmyk.chunks_exact(4)
        .flat_map(|px| {
            let [c, m, y, k] = [px[0], px[1], px[2], px[3]].map(|v| f32::from(v) / 255.0);
            [
                (255.0 * (1.0 - c) * (1.0 - k)) as u8,
                (255.0 * (1.0 - m) * (1.0 - k)) as u8,
                (255.0 * (1.0 - y) * (1.0 - k)) as u8,
            ]
        })
        .collect()
}
256
257fn build_elements(text: &str) -> Vec<ContentElement> {
259 let mut elements = Vec::new();
260 let mut current_offset = 0u64;
261
262 for paragraph in text.split("\n\n") {
264 let trimmed = paragraph.trim();
265 if trimmed.is_empty() {
266 current_offset += paragraph.len() as u64 + 2; continue;
268 }
269
270 if looks_like_heading(trimmed) {
272 elements.push(ContentElement::Heading {
273 level: 1,
274 text: trimmed.to_string(),
275 byte_offset: current_offset,
276 });
277 } else {
278 elements.push(ContentElement::Paragraph {
279 text: trimmed.to_string(),
280 byte_offset: current_offset,
281 });
282 }
283
284 current_offset += paragraph.len() as u64 + 2;
285 }
286
287 elements
288}
289
/// Heuristic heading detector: short (≤100 bytes), single-line, not ending in
/// a period, at most 8 words, with at least half of the words capitalized.
fn looks_like_heading(text: &str) -> bool {
    if text.len() > 100 || text.ends_with('.') || text.contains('\n') {
        return false;
    }

    let words: Vec<&str> = text.split_whitespace().collect();
    if words.len() > 8 {
        return false;
    }

    let capitalized = words
        .iter()
        .filter(|w| w.chars().next().is_some_and(char::is_uppercase))
        .count();
    capitalized >= words.len() / 2
}
320
/// Estimates the page count of the extracted text.
///
/// Prefers counting form-feed (page-break, `\x0C`) characters when present;
/// otherwise falls back to a rough ~3000-bytes-per-page heuristic, with a
/// minimum of one page.
fn estimate_page_count(text: &str) -> u32 {
    let breaks = text.matches('\x0C').count();
    if breaks > 0 {
        return (breaks + 1) as u32;
    }
    ((text.len() / 3000) as u32).max(1)
}
333
/// Alternative PDF extractor backed by the `pdf_oxide` crate (feature-gated).
#[cfg(feature = "pdf_oxide")]
pub struct PdfOxideExtractor;
346
#[cfg(feature = "pdf_oxide")]
impl PdfOxideExtractor {
    /// Creates a new `PdfOxideExtractor`.
    #[must_use]
    pub fn new() -> Self {
        PdfOxideExtractor
    }
}
355
#[cfg(feature = "pdf_oxide")]
impl Default for PdfOxideExtractor {
    /// Equivalent to [`PdfOxideExtractor::new`].
    fn default() -> Self {
        PdfOxideExtractor::new()
    }
}
362
#[cfg(feature = "pdf_oxide")]
#[async_trait]
impl ContentExtractor for PdfOxideExtractor {
    fn supported_types(&self) -> &[&str] {
        &["application/pdf"]
    }

    fn can_extract_by_extension(&self, path: &Path) -> bool {
        matches!(
            path.extension().and_then(|ext| ext.to_str()),
            Some(ext) if ext.eq_ignore_ascii_case("pdf")
        )
    }

    /// Extracts text, elements, images, and page count via `pdf_oxide`,
    /// running the blocking work off the async executor.
    async fn extract(&self, path: &Path) -> Result<ExtractedContent, ExtractError> {
        debug!("Extracting PDF with pdf_oxide: {:?}", path);

        let owned_path = path.to_path_buf();
        let outcome = tokio::task::spawn_blocking(move || extract_with_pdf_oxide(&owned_path))
            .await
            .map_err(|e| ExtractError::Failed(format!("Task join error: {e}")))?;
        let (text, page_count, images) =
            outcome.map_err(|e| ExtractError::Failed(format!("PDF extraction failed: {e}")))?;

        let elements = build_elements(&text);

        Ok(ExtractedContent {
            text,
            elements,
            images,
            metadata: ContentMetadataInfo {
                page_count: Some(page_count),
                ..Default::default()
            },
        })
    }
}
401
/// Opens `path` with `pdf_oxide` and returns (text, page count, images).
///
/// Per-page text or image failures are logged at debug level and skipped;
/// image collection stops once `MAX_IMAGES` or `MAX_TOTAL_BYTES` is reached.
#[cfg(feature = "pdf_oxide")]
fn extract_with_pdf_oxide(
    path: &std::path::Path,
) -> Result<(String, u32, Vec<ExtractedImage>), String> {
    use pdf_oxide::PdfDocument;

    let mut doc = PdfDocument::open(path).map_err(|e| format!("Failed to open PDF: {e}"))?;

    let page_count = doc
        .page_count()
        .map_err(|e| format!("Failed to get page count: {e}"))?;

    // Pass 1: concatenate per-page text, separating pages with blank lines.
    let mut text = String::new();
    for idx in 0..page_count {
        match doc.extract_text(idx) {
            Ok(page_text) => {
                text.push_str(&page_text);
                text.push_str("\n\n");
            }
            Err(e) => debug!("Failed to extract text from page {}: {}", idx + 1, e),
        }
    }

    // Pass 2: collect images until either cap is reached.
    let mut images = Vec::new();
    let mut total_bytes = 0usize;

    'pages: for idx in 0..page_count {
        if images.len() >= MAX_IMAGES || total_bytes >= MAX_TOTAL_BYTES {
            break;
        }

        let page_images = match doc.extract_images(idx) {
            Ok(found) => found,
            Err(e) => {
                debug!("Failed to extract images from page {}: {}", idx + 1, e);
                continue;
            }
        };

        for img in page_images {
            if images.len() >= MAX_IMAGES || total_bytes >= MAX_TOTAL_BYTES {
                continue 'pages;
            }

            // JPEG payloads pass through unchanged; everything else is
            // converted to PNG.
            let (data, mime_type) = match img.data() {
                pdf_oxide::extractors::images::ImageData::Jpeg(bytes) => {
                    (bytes.clone(), "image/jpeg")
                }
                _ => match img.to_png_bytes() {
                    Ok(png_bytes) => (png_bytes, "image/png"),
                    Err(e) => {
                        debug!("Failed to convert image to PNG: {e}");
                        continue;
                    }
                },
            };

            total_bytes += data.len();
            images.push(ExtractedImage {
                data,
                mime_type: mime_type.to_string(),
                caption: None,
                page: Some(idx as u32 + 1),
            });
        }
    }

    debug!(
        "pdf_oxide: Extracted {} pages, {} chars, {} images",
        page_count,
        text.len(),
        images.len()
    );

    Ok((text, page_count as u32, images))
}
485
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_looks_like_heading() {
        assert!(looks_like_heading("Chapter 1"));
        assert!(looks_like_heading("INTRODUCTION"));
        assert!(looks_like_heading("The Quick Brown Fox"));
        assert!(!looks_like_heading("This is a normal sentence."));
        assert!(!looks_like_heading(
            "This is a very long paragraph that goes on and on and definitely is not a heading"
        ));
        // Multi-line spans are never headings.
        assert!(!looks_like_heading("Line one\nLine two"));
    }

    #[test]
    fn test_estimate_page_count() {
        assert_eq!(estimate_page_count("short"), 1);
        assert_eq!(estimate_page_count(&"x".repeat(6000)), 2);
        assert_eq!(estimate_page_count("page1\x0Cpage2\x0Cpage3"), 3);
        // Empty text still counts as one page.
        assert_eq!(estimate_page_count(""), 1);
    }

    #[test]
    fn test_build_elements() {
        let text = "Title\n\nFirst paragraph here.\n\nSecond paragraph.";
        let elements = build_elements(text);
        assert_eq!(elements.len(), 3);
    }

    #[test]
    fn test_build_elements_empty_input() {
        assert!(build_elements("").is_empty());
        assert!(build_elements("\n\n\n\n").is_empty());
    }

    #[test]
    fn test_cmyk_to_rgb() {
        // No ink -> white; full ink -> black.
        assert_eq!(cmyk_to_rgb(&[0, 0, 0, 0]), vec![255, 255, 255]);
        assert_eq!(cmyk_to_rgb(&[255, 255, 255, 255]), vec![0, 0, 0]);
        // An incomplete trailing sample is ignored.
        assert!(cmyk_to_rgb(&[1, 2]).is_empty());
    }
}