// ragfs_extract/pdf.rs

1//! PDF content extractor.
2//!
3//! Uses pdf-extract to extract text content and lopdf for embedded images.
4
5use async_trait::async_trait;
6use flate2::read::ZlibDecoder;
7use lopdf::Document;
8use ragfs_core::{
9    ContentElement, ContentExtractor, ContentMetadataInfo, ExtractError, ExtractedContent,
10    ExtractedImage,
11};
12use std::io::Read;
13use std::path::Path;
14use tracing::{debug, warn};
15
/// Extractor for PDF files.
///
/// A stateless unit struct; construct via [`PdfExtractor::new`] or `Default`.
#[derive(Default)]
pub struct PdfExtractor;

impl PdfExtractor {
    /// Create a new PDF extractor.
    #[must_use]
    pub fn new() -> Self {
        Self
    }
}
32
33#[async_trait]
34impl ContentExtractor for PdfExtractor {
35    fn supported_types(&self) -> &[&str] {
36        &["application/pdf"]
37    }
38
39    fn can_extract_by_extension(&self, path: &Path) -> bool {
40        path.extension()
41            .and_then(|ext| ext.to_str())
42            .is_some_and(|ext| ext.eq_ignore_ascii_case("pdf"))
43    }
44
45    async fn extract(&self, path: &Path) -> Result<ExtractedContent, ExtractError> {
46        debug!("Extracting PDF: {:?}", path);
47
48        // Read PDF file
49        let bytes = tokio::fs::read(path).await?;
50
51        // Extract text using pdf-extract (blocking operation)
52        let text = tokio::task::spawn_blocking({
53            let bytes = bytes.clone();
54            move || extract_pdf_text(&bytes)
55        })
56        .await
57        .map_err(|e| ExtractError::Failed(format!("Task join error: {e}")))?
58        .map_err(|e| ExtractError::Failed(format!("PDF extraction failed: {e}")))?;
59
60        // Extract images using lopdf (blocking operation)
61        let images = tokio::task::spawn_blocking(move || extract_pdf_images(&bytes))
62            .await
63            .map_err(|e| ExtractError::Failed(format!("Image extraction task error: {e}")))?;
64
65        // Split into pages/paragraphs for elements
66        let elements = build_elements(&text);
67
68        // Estimate page count from page breaks or text length
69        let page_count = estimate_page_count(&text);
70
71        Ok(ExtractedContent {
72            text,
73            elements,
74            images,
75            metadata: ContentMetadataInfo {
76                page_count: Some(page_count),
77                ..Default::default()
78            },
79        })
80    }
81}
82
83/// Extract text from PDF bytes using pdf-extract.
84fn extract_pdf_text(bytes: &[u8]) -> Result<String, String> {
85    pdf_extract::extract_text_from_mem(bytes).map_err(|e| e.to_string())
86}
87
// --- Image extraction limits ---

/// Maximum number of images extracted from a single document.
const MAX_IMAGES: usize = 100;
/// Cumulative byte budget across all extracted images.
const MAX_TOTAL_BYTES: usize = 50 * 1024 * 1024; // 50MB
/// Minimum width/height in pixels; smaller images (icons, bullets) are skipped.
const MIN_DIMENSION: u32 = 50;
92
93/// Extract images from PDF document using lopdf.
94fn extract_pdf_images(bytes: &[u8]) -> Vec<ExtractedImage> {
95    let doc = match Document::load_mem(bytes) {
96        Ok(d) => d,
97        Err(e) => {
98            warn!("Failed to load PDF for image extraction: {}", e);
99            return vec![];
100        }
101    };
102
103    let mut images = Vec::new();
104    let mut total_bytes = 0usize;
105
106    let pages = doc.get_pages();
107    for (page_num, page_id) in pages {
108        if images.len() >= MAX_IMAGES {
109            debug!(
110                "Reached maximum image count ({}), stopping extraction",
111                MAX_IMAGES
112            );
113            break;
114        }
115
116        match doc.get_page_images(page_id) {
117            Ok(page_images) => {
118                for pdf_image in page_images {
119                    if images.len() >= MAX_IMAGES || total_bytes >= MAX_TOTAL_BYTES {
120                        break;
121                    }
122
123                    // Skip tiny images
124                    if pdf_image.width < i64::from(MIN_DIMENSION)
125                        || pdf_image.height < i64::from(MIN_DIMENSION)
126                    {
127                        debug!(
128                            "Skipping small image: {}x{}",
129                            pdf_image.width, pdf_image.height
130                        );
131                        continue;
132                    }
133
134                    if let Some(extracted) = decode_pdf_image(&pdf_image, page_num) {
135                        total_bytes += extracted.data.len();
136                        images.push(extracted);
137                    }
138                }
139            }
140            Err(e) => {
141                debug!("Failed to get images from page {}: {}", page_num, e);
142            }
143        }
144    }
145
146    debug!(
147        "Extracted {} images from PDF ({} bytes total)",
148        images.len(),
149        total_bytes
150    );
151    images
152}
153
154/// Decode a PDF image into `ExtractedImage` format.
155fn decode_pdf_image(pdf_image: &lopdf::xobject::PdfImage, page_num: u32) -> Option<ExtractedImage> {
156    let filters = pdf_image.filters.as_ref()?;
157
158    // Determine MIME type and decode based on filter
159    let (data, mime_type) = if filters.iter().any(|f| f == "DCTDecode") {
160        // JPEG - can use raw content directly
161        (pdf_image.content.to_vec(), "image/jpeg".to_string())
162    } else if filters.iter().any(|f| f == "FlateDecode") {
163        // Compressed raw image data - decompress and convert to PNG
164        match decode_flate_image(pdf_image) {
165            Ok((data, mime)) => (data, mime),
166            Err(e) => {
167                debug!("Failed to decode FlateDecode image: {}", e);
168                return None;
169            }
170        }
171    } else if filters.iter().any(|f| f == "JPXDecode") {
172        // JPEG 2000 - use raw content
173        (pdf_image.content.to_vec(), "image/jp2".to_string())
174    } else {
175        // Unsupported filter
176        debug!("Unsupported image filter: {:?}", filters);
177        return None;
178    };
179
180    Some(ExtractedImage {
181        data,
182        mime_type,
183        caption: None, // Will be filled by vision model in future
184        page: Some(page_num),
185    })
186}
187
188/// Decode `FlateDecode` compressed image to PNG.
189fn decode_flate_image(pdf_image: &lopdf::xobject::PdfImage) -> Result<(Vec<u8>, String), String> {
190    // Decompress the data
191    let mut decoder = ZlibDecoder::new(pdf_image.content);
192    let mut decompressed = Vec::new();
193    decoder
194        .read_to_end(&mut decompressed)
195        .map_err(|e| format!("Decompression failed: {e}"))?;
196
197    // Determine color space and create image
198    let color_space = pdf_image.color_space.as_deref().unwrap_or("DeviceRGB");
199    let width = pdf_image.width as u32;
200    let height = pdf_image.height as u32;
201
202    let img = match color_space {
203        "DeviceRGB" | "RGB" => image::RgbImage::from_raw(width, height, decompressed)
204            .map(image::DynamicImage::ImageRgb8),
205        "DeviceGray" | "Gray" => image::GrayImage::from_raw(width, height, decompressed)
206            .map(image::DynamicImage::ImageLuma8),
207        "DeviceCMYK" | "CMYK" => {
208            // Convert CMYK to RGB
209            let rgb_data = cmyk_to_rgb(&decompressed);
210            image::RgbImage::from_raw(width, height, rgb_data).map(image::DynamicImage::ImageRgb8)
211        }
212        _ => {
213            // Attempt RGB as fallback
214            debug!("Unknown color space '{}', attempting RGB", color_space);
215            image::RgbImage::from_raw(width, height, decompressed)
216                .map(image::DynamicImage::ImageRgb8)
217        }
218    };
219
220    let img = img.ok_or_else(|| "Failed to create image from raw data".to_string())?;
221
222    // Encode to PNG
223    let mut png_data = Vec::new();
224    img.write_to(
225        &mut std::io::Cursor::new(&mut png_data),
226        image::ImageFormat::Png,
227    )
228    .map_err(|e| format!("PNG encoding failed: {e}"))?;
229
230    Ok((png_data, "image/png".to_string()))
231}
232
/// Convert packed 8-bit CMYK samples to packed 8-bit RGB.
///
/// Trailing bytes that do not form a complete 4-byte CMYK pixel are ignored.
#[allow(clippy::cast_possible_truncation, clippy::cast_sign_loss)]
fn cmyk_to_rgb(cmyk: &[u8]) -> Vec<u8> {
    cmyk.chunks_exact(4)
        .flat_map(|px| {
            // Black component applies equally to all three output channels.
            let k = f32::from(px[3]) / 255.0;
            px[..3].iter().map(move |&sample| {
                let ink = f32::from(sample) / 255.0;
                (255.0 * (1.0 - ink) * (1.0 - k)) as u8
            })
        })
        .collect()
}
256
257/// Build `ContentElements` from extracted text.
258fn build_elements(text: &str) -> Vec<ContentElement> {
259    let mut elements = Vec::new();
260    let mut current_offset = 0u64;
261
262    // Split by double newlines to get paragraphs
263    for paragraph in text.split("\n\n") {
264        let trimmed = paragraph.trim();
265        if trimmed.is_empty() {
266            current_offset += paragraph.len() as u64 + 2; // +2 for \n\n
267            continue;
268        }
269
270        // Check if it looks like a heading (short, possibly capitalized)
271        if looks_like_heading(trimmed) {
272            elements.push(ContentElement::Heading {
273                level: 1,
274                text: trimmed.to_string(),
275                byte_offset: current_offset,
276            });
277        } else {
278            elements.push(ContentElement::Paragraph {
279                text: trimmed.to_string(),
280                byte_offset: current_offset,
281            });
282        }
283
284        current_offset += paragraph.len() as u64 + 2;
285    }
286
287    elements
288}
289
/// Heuristic to detect if text looks like a heading.
///
/// A heading candidate is a single short line without a trailing period,
/// containing at most 8 words of which at least half start uppercase.
fn looks_like_heading(text: &str) -> bool {
    // Headings are short, single-line, and don't end with a period.
    if text.len() > 100 || text.ends_with('.') || text.contains('\n') {
        return false;
    }

    let words: Vec<&str> = text.split_whitespace().collect();
    if words.len() > 8 {
        return false;
    }

    // At least half the words begin with an uppercase letter.
    let capitalized = words
        .iter()
        .filter(|w| w.chars().next().is_some_and(char::is_uppercase))
        .count();
    capitalized >= words.len() / 2
}
320
/// Estimate page count from text.
///
/// Prefers explicit form-feed page breaks; otherwise falls back to a rough
/// character-density estimate. Always returns at least 1.
fn estimate_page_count(text: &str) -> u32 {
    // Form feed (0x0C) is the page-break marker left by the text extractor.
    let breaks = text.bytes().filter(|&b| b == 0x0C).count();
    if breaks > 0 {
        return (breaks + 1) as u32;
    }

    // Fall back to ~3000 characters per page.
    (text.len() / 3000).max(1) as u32
}
333
334// ============================================================================
335// Alternative PDF extractor using pdf_oxide (optional feature)
336// ============================================================================
337
338/// Alternative PDF extractor using the `pdf_oxide` library.
339///
340/// This extractor provides potentially better performance and a cleaner API
341/// compared to the default pdf-extract + lopdf combination.
342///
343/// Enable with: `cargo build --features pdf_oxide`
#[cfg(feature = "pdf_oxide")]
#[derive(Default)]
pub struct PdfOxideExtractor;

#[cfg(feature = "pdf_oxide")]
impl PdfOxideExtractor {
    /// Create a new PDF oxide extractor.
    #[must_use]
    pub fn new() -> Self {
        Self
    }
}
362
#[cfg(feature = "pdf_oxide")]
#[async_trait]
impl ContentExtractor for PdfOxideExtractor {
    fn supported_types(&self) -> &[&str] {
        &["application/pdf"]
    }

    fn can_extract_by_extension(&self, path: &Path) -> bool {
        matches!(
            path.extension().and_then(|e| e.to_str()),
            Some(ext) if ext.eq_ignore_ascii_case("pdf")
        )
    }

    /// Extract content from a PDF via the `pdf_oxide` backend.
    async fn extract(&self, path: &Path) -> Result<ExtractedContent, ExtractError> {
        debug!("Extracting PDF with pdf_oxide: {:?}", path);

        let owned_path = path.to_path_buf();

        // pdf_oxide calls are blocking; run them off the async runtime.
        let joined = tokio::task::spawn_blocking(move || extract_with_pdf_oxide(&owned_path))
            .await
            .map_err(|e| ExtractError::Failed(format!("Task join error: {e}")))?;
        let (text, page_count, images) =
            joined.map_err(|e| ExtractError::Failed(format!("PDF extraction failed: {e}")))?;

        let elements = build_elements(&text);
        let metadata = ContentMetadataInfo {
            page_count: Some(page_count),
            ..Default::default()
        };

        Ok(ExtractedContent {
            text,
            elements,
            images,
            metadata,
        })
    }
}
401
/// Blocking worker for the `pdf_oxide` backend.
///
/// Opens the PDF at `path` and returns `(full_text, page_count, images)`,
/// with errors rendered as `String` for the caller to wrap.
/// Per-page text/image failures are logged at debug level and skipped;
/// only open/page-count failures abort the whole extraction.
#[cfg(feature = "pdf_oxide")]
fn extract_with_pdf_oxide(
    path: &std::path::Path,
) -> Result<(String, u32, Vec<ExtractedImage>), String> {
    use pdf_oxide::PdfDocument;

    // NOTE(review): `doc` must be mutable — the pdf_oxide extract_* calls
    // appear to take `&mut self`.
    let mut doc = PdfDocument::open(path).map_err(|e| format!("Failed to open PDF: {e}"))?;

    let page_count = doc
        .page_count()
        .map_err(|e| format!("Failed to get page count: {e}"))?;

    // Pass 1: concatenate text from all pages, separated by blank lines
    // (the "\n\n" separator is what build_elements splits on).
    let mut text = String::new();
    for page_idx in 0..page_count {
        match doc.extract_text(page_idx) {
            Ok(page_text) => {
                text.push_str(&page_text);
                text.push_str("\n\n");
            }
            Err(e) => {
                // Log with a 1-based page number for readability.
                debug!("Failed to extract text from page {}: {}", page_idx + 1, e);
            }
        }
    }

    // Pass 2: extract images, bounded by MAX_IMAGES and MAX_TOTAL_BYTES.
    let mut images = Vec::new();
    let mut total_bytes = 0usize;

    for page_idx in 0..page_count {
        if images.len() >= MAX_IMAGES || total_bytes >= MAX_TOTAL_BYTES {
            break;
        }

        match doc.extract_images(page_idx) {
            Ok(page_images) => {
                for img in page_images {
                    // Re-check limits per image, not just per page.
                    if images.len() >= MAX_IMAGES || total_bytes >= MAX_TOTAL_BYTES {
                        break;
                    }

                    // JPEG payloads pass through unchanged; everything else
                    // is re-encoded to PNG (failures skip just that image).
                    let (data, mime_type) = match img.data() {
                        pdf_oxide::extractors::images::ImageData::Jpeg(bytes) => {
                            (bytes.clone(), "image/jpeg")
                        }
                        _ => {
                            // For raw pixel data, convert to PNG
                            match img.to_png_bytes() {
                                Ok(png_bytes) => (png_bytes, "image/png"),
                                Err(e) => {
                                    debug!("Failed to convert image to PNG: {e}");
                                    continue;
                                }
                            }
                        }
                    };

                    total_bytes += data.len();
                    images.push(ExtractedImage {
                        data,
                        mime_type: mime_type.to_string(),
                        caption: None,
                        // Pages are reported 1-based in the output.
                        page: Some(page_idx as u32 + 1),
                    });
                }
            }
            Err(e) => {
                debug!("Failed to extract images from page {}: {}", page_idx + 1, e);
            }
        }
    }

    debug!(
        "pdf_oxide: Extracted {} pages, {} chars, {} images",
        page_count,
        text.len(),
        images.len()
    );

    Ok((text, page_count as u32, images))
}
485
#[cfg(test)]
mod tests {
    use super::*;

    // Exercises the heading heuristic: short capitalized lines qualify,
    // sentences with a trailing period and long paragraphs do not.
    #[test]
    fn test_looks_like_heading() {
        assert!(looks_like_heading("Chapter 1"));
        assert!(looks_like_heading("INTRODUCTION"));
        assert!(looks_like_heading("The Quick Brown Fox"));
        assert!(!looks_like_heading("This is a normal sentence."));
        assert!(!looks_like_heading(
            "This is a very long paragraph that goes on and on and definitely is not a heading"
        ));
    }

    // Covers both estimation paths: the ~3000-chars-per-page fallback and
    // explicit form-feed (\x0C) page breaks.
    #[test]
    fn test_estimate_page_count() {
        assert_eq!(estimate_page_count("short"), 1);
        assert_eq!(estimate_page_count(&"x".repeat(6000)), 2);
        assert_eq!(estimate_page_count("page1\x0Cpage2\x0Cpage3"), 3);
    }

    // Blank-line-separated text yields one element per paragraph
    // (the short "Title" becomes a heading, the rest paragraphs).
    #[test]
    fn test_build_elements() {
        let text = "Title\n\nFirst paragraph here.\n\nSecond paragraph.";
        let elements = build_elements(text);
        assert_eq!(elements.len(), 3);
    }
}