1use chrono::{DateTime, Utc};
31use serde::{Deserialize, Serialize};
32use std::collections::HashMap;
33use std::ops::Range;
34use std::path::PathBuf;
35use uuid::Uuid;
36
37#[derive(Debug, Clone, Serialize, Deserialize)]
43pub struct FileRecord {
44 pub id: Uuid,
46 pub path: PathBuf,
48 pub size_bytes: u64,
50 pub mime_type: String,
52 pub content_hash: String,
54 pub modified_at: DateTime<Utc>,
56 pub indexed_at: Option<DateTime<Utc>>,
58 pub chunk_count: u32,
60 pub status: FileStatus,
62 pub error_message: Option<String>,
64}
65
66#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
68#[serde(rename_all = "lowercase")]
69pub enum FileStatus {
70 Pending,
72 Indexing,
74 Indexed,
76 Error,
78 Deleted,
80}
81
82#[derive(Debug, Clone, Serialize, Deserialize)]
88pub struct Chunk {
89 pub id: Uuid,
91 pub file_id: Uuid,
93 pub file_path: PathBuf,
95 pub content: String,
97 pub content_type: ContentType,
99 pub mime_type: Option<String>,
101 pub chunk_index: u32,
103 pub byte_range: Range<u64>,
105 pub line_range: Option<Range<u32>>,
107 pub parent_chunk_id: Option<Uuid>,
109 pub depth: u8,
111 pub embedding: Option<Vec<f32>>,
113 pub metadata: ChunkMetadata,
115}
116
117#[derive(Debug, Clone, Serialize, Deserialize)]
119#[serde(tag = "type", rename_all = "snake_case")]
120pub enum ContentType {
121 Text,
123 Code {
125 language: String,
127 symbol: Option<CodeSymbol>,
129 },
130 ImageCaption,
132 PdfPage {
134 page_num: u32,
136 },
137 Markdown,
139}
140
141#[derive(Debug, Clone, Serialize, Deserialize)]
143pub struct CodeSymbol {
144 pub kind: SymbolKind,
146 pub name: String,
148}
149
150#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
152#[serde(rename_all = "snake_case")]
153pub enum SymbolKind {
154 Function,
155 Method,
156 Class,
157 Struct,
158 Enum,
159 Module,
160 Constant,
161 Variable,
162 Interface,
163 Trait,
164}
165
166#[derive(Debug, Clone, Default, Serialize, Deserialize)]
168pub struct ChunkMetadata {
169 pub embedding_model: Option<String>,
171 pub indexed_at: Option<DateTime<Utc>>,
173 pub token_count: Option<usize>,
175 #[serde(flatten)]
177 pub extra: HashMap<String, String>,
178}
179
180#[derive(Debug, Clone)]
186pub struct ExtractedContent {
187 pub text: String,
189 pub elements: Vec<ContentElement>,
191 pub images: Vec<ExtractedImage>,
193 pub metadata: ContentMetadataInfo,
195}
196
/// One structured element of extracted content.
///
/// Every variant carries a `byte_offset`; presumably the position of the
/// element in the source document (confirm against the extractor).
#[derive(Clone, Debug)]
pub enum ContentElement {
    /// A heading.
    Heading {
        /// Heading level; the numbering scheme is defined by the extractor.
        level: u8,
        /// Heading text.
        text: String,
        /// Byte offset of the element.
        byte_offset: u64,
    },
    /// A paragraph of text.
    Paragraph {
        /// Paragraph text.
        text: String,
        /// Byte offset of the element.
        byte_offset: u64,
    },
    /// A block of code.
    CodeBlock {
        /// Language of the code, if known.
        language: Option<String>,
        /// The code itself.
        code: String,
        /// Byte offset of the element.
        byte_offset: u64,
    },
    /// A list of items.
    List {
        /// Item texts, in order.
        items: Vec<String>,
        /// `true` for an ordered list, `false` otherwise.
        ordered: bool,
        /// Byte offset of the element.
        byte_offset: u64,
    },
    /// A table.
    Table {
        /// Header cells.
        headers: Vec<String>,
        /// Body rows, each a list of cells.
        rows: Vec<Vec<String>>,
        /// Byte offset of the element.
        byte_offset: u64,
    },
}
225
/// An image pulled out of a document during extraction.
#[derive(Clone, Debug)]
pub struct ExtractedImage {
    /// Raw image bytes; the encoding is described by `mime_type`.
    pub data: Vec<u8>,
    /// MIME type string of the image.
    pub mime_type: String,
    /// Caption text, if any.
    pub caption: Option<String>,
    /// Page the image came from, if known; the numbering base is not defined
    /// here.
    pub page: Option<u32>,
}
238
239#[derive(Debug, Clone, Default)]
241pub struct ContentMetadataInfo {
242 pub title: Option<String>,
244 pub author: Option<String>,
246 pub language: Option<String>,
248 pub page_count: Option<u32>,
250 pub created_at: Option<DateTime<Utc>>,
252}
253
254#[derive(Debug, Clone, Serialize, Deserialize)]
260pub struct ChunkConfig {
261 pub target_size: usize,
263 pub max_size: usize,
265 pub overlap: usize,
267 pub hierarchical: bool,
269 pub max_depth: u8,
271}
272
273impl Default for ChunkConfig {
274 fn default() -> Self {
275 Self {
276 target_size: 512,
277 max_size: 1024,
278 overlap: 64,
279 hierarchical: true,
280 max_depth: 2,
281 }
282 }
283}
284
285#[derive(Debug, Clone)]
287pub struct ChunkOutput {
288 pub content: String,
290 pub byte_range: Range<u64>,
292 pub line_range: Option<Range<u32>>,
294 pub parent_index: Option<usize>,
296 pub depth: u8,
298 pub metadata: ChunkOutputMetadata,
300}
301
/// Optional symbol and language information attached to a [`ChunkOutput`].
///
/// `Default` yields all `None`.
#[derive(Clone, Debug, Default)]
pub struct ChunkOutputMetadata {
    /// Kind of symbol as a free-form string, if any.
    pub symbol_type: Option<String>,
    /// Name of the symbol, if any.
    pub symbol_name: Option<String>,
    /// Language of the chunk, if known.
    pub language: Option<String>,
}
312
313#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
319#[serde(rename_all = "lowercase")]
320pub enum Modality {
321 Text,
322 Image,
323 Audio,
324}
325
326#[derive(Debug, Clone, Serialize, Deserialize)]
328pub struct EmbeddingConfig {
329 pub normalize: bool,
331 pub instruction: Option<String>,
333 pub batch_size: usize,
335}
336
337impl Default for EmbeddingConfig {
338 fn default() -> Self {
339 Self {
340 normalize: true,
341 instruction: None,
342 batch_size: 32,
343 }
344 }
345}
346
/// An embedding vector together with a token count.
#[derive(Clone, Debug)]
pub struct EmbeddingOutput {
    /// The embedding vector.
    pub embedding: Vec<f32>,
    /// Token count; presumably the number of tokens in the embedded input
    /// (confirm against the embedder).
    pub token_count: usize,
}
355
356#[derive(Debug, Clone)]
362pub struct SearchQuery {
363 pub embedding: Vec<f32>,
365 pub text: Option<String>,
367 pub limit: usize,
369 pub filters: Vec<SearchFilter>,
371 pub metric: DistanceMetric,
373}
374
375#[derive(Debug, Clone)]
377pub enum SearchFilter {
378 PathPrefix(String),
380 PathGlob(String),
382 MimeType(String),
384 Language(String),
386 ModifiedAfter(DateTime<Utc>),
388 ModifiedBefore(DateTime<Utc>),
390 MinDepth(u8),
392 MaxDepth(u8),
394}
395
396#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize)]
398#[serde(rename_all = "lowercase")]
399pub enum DistanceMetric {
400 #[default]
401 Cosine,
402 L2,
403 Dot,
404}
405
406#[derive(Debug, Clone, Serialize, Deserialize)]
408pub struct SearchResult {
409 pub chunk_id: Uuid,
411 pub file_path: PathBuf,
413 pub content: String,
415 pub score: f32,
417 pub byte_range: Range<u64>,
419 pub line_range: Option<Range<u32>>,
421 pub metadata: HashMap<String, String>,
423}
424
425#[derive(Debug, Clone, Serialize, Deserialize)]
427pub struct StoreStats {
428 pub total_chunks: u64,
430 pub total_files: u64,
432 pub index_size_bytes: u64,
434 pub last_updated: Option<DateTime<Utc>>,
436}
437
438#[derive(Debug, Clone, Default, Serialize, Deserialize)]
444pub struct IndexStats {
445 pub total_files: u64,
447 pub indexed_files: u64,
449 pub pending_files: u64,
451 pub error_files: u64,
453 pub total_chunks: u64,
455 pub last_update: Option<DateTime<Utc>>,
457}
458
/// A change to a file, identified by path.
///
/// Derives `PartialEq`/`Eq` so events can be compared directly (for example
/// with `assert_eq!` or when de-duplicating) instead of being destructured
/// with `match` first.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum FileEvent {
    /// A file was created at the given path.
    Created(PathBuf),
    /// The file at the given path was modified.
    Modified(PathBuf),
    /// The file at the given path was deleted.
    Deleted(PathBuf),
    /// A file was renamed.
    Renamed {
        /// Path before the rename.
        from: PathBuf,
        /// Path after the rename.
        to: PathBuf,
    },
}
471
#[cfg(test)]
mod tests {
    use super::*;

    /// Serializes `value` to a JSON string, panicking if serialization fails.
    fn to_json<T: serde::Serialize>(value: &T) -> String {
        serde_json::to_string(value).unwrap()
    }

    /// Serializes `value` to JSON and parses that JSON back into a new `T`,
    /// panicking if either step fails.
    fn roundtrip<T>(value: &T) -> T
    where
        T: serde::Serialize + serde::de::DeserializeOwned,
    {
        serde_json::from_str(&to_json(value)).unwrap()
    }

    // ---- FileRecord / FileStatus -------------------------------------------

    #[test]
    fn test_file_record_serialization() {
        let record = FileRecord {
            id: Uuid::new_v4(),
            path: PathBuf::from("/test/file.txt"),
            size_bytes: 1024,
            mime_type: String::from("text/plain"),
            content_hash: String::from("abc123"),
            modified_at: Utc::now(),
            indexed_at: Some(Utc::now()),
            chunk_count: 5,
            status: FileStatus::Indexed,
            error_message: None,
        };

        let deserialized = roundtrip(&record);

        assert_eq!(record.id, deserialized.id);
        assert_eq!(record.path, deserialized.path);
        assert_eq!(record.size_bytes, deserialized.size_bytes);
        assert_eq!(record.status, deserialized.status);
    }

    #[test]
    fn test_file_status_serialization() {
        let cases = [
            (FileStatus::Pending, "\"pending\""),
            (FileStatus::Indexed, "\"indexed\""),
            (FileStatus::Error, "\"error\""),
        ];
        for (status, expected) in cases {
            assert_eq!(to_json(&status), expected);
        }
    }

    #[test]
    fn test_file_status_equality() {
        assert_eq!(FileStatus::Pending, FileStatus::Pending);
        assert_ne!(FileStatus::Pending, FileStatus::Indexed);
    }

    // ---- Chunk / ContentType / SymbolKind ----------------------------------

    #[test]
    fn test_chunk_serialization() {
        let symbol = CodeSymbol {
            kind: SymbolKind::Function,
            name: String::from("main"),
        };
        let chunk = Chunk {
            id: Uuid::new_v4(),
            file_id: Uuid::new_v4(),
            file_path: PathBuf::from("/test/file.rs"),
            content: String::from("fn main() {}"),
            content_type: ContentType::Code {
                language: String::from("rust"),
                symbol: Some(symbol),
            },
            mime_type: Some(String::from("text/x-rust")),
            chunk_index: 0,
            byte_range: 0..12,
            line_range: Some(0..1),
            parent_chunk_id: None,
            depth: 0,
            embedding: None,
            metadata: ChunkMetadata::default(),
        };

        let deserialized = roundtrip(&chunk);

        assert_eq!(chunk.id, deserialized.id);
        assert_eq!(chunk.content, deserialized.content);
    }

    #[test]
    fn test_content_type_text() {
        let json = to_json(&ContentType::Text);
        assert!(json.contains("\"type\":\"text\""));
    }

    #[test]
    fn test_content_type_code() {
        let json = to_json(&ContentType::Code {
            language: String::from("python"),
            symbol: None,
        });
        assert!(json.contains("\"type\":\"code\""));
        assert!(json.contains("\"language\":\"python\""));
    }

    #[test]
    fn test_content_type_pdf_page() {
        let json = to_json(&ContentType::PdfPage { page_num: 5 });
        assert!(json.contains("\"type\":\"pdf_page\""));
        assert!(json.contains("\"page_num\":5"));
    }

    #[test]
    fn test_content_type_markdown() {
        let json = to_json(&ContentType::Markdown);
        assert!(json.contains("\"type\":\"markdown\""));
    }

    #[test]
    fn test_symbol_kind_serialization() {
        let cases = [
            (SymbolKind::Function, "\"function\""),
            (SymbolKind::Struct, "\"struct\""),
            (SymbolKind::Trait, "\"trait\""),
        ];
        for (kind, expected) in cases {
            assert_eq!(to_json(&kind), expected);
        }
    }

    // ---- ChunkConfig -------------------------------------------------------

    #[test]
    fn test_chunk_config_default() {
        let ChunkConfig {
            target_size,
            max_size,
            overlap,
            hierarchical,
            max_depth,
        } = ChunkConfig::default();

        assert_eq!(target_size, 512);
        assert_eq!(max_size, 1024);
        assert_eq!(overlap, 64);
        assert!(hierarchical);
        assert_eq!(max_depth, 2);
    }

    #[test]
    fn test_chunk_config_serialization() {
        let config = ChunkConfig::default();
        let deserialized = roundtrip(&config);

        assert_eq!(config.target_size, deserialized.target_size);
        assert_eq!(config.max_size, deserialized.max_size);
    }

    // ---- EmbeddingConfig / Modality ----------------------------------------

    #[test]
    fn test_embedding_config_default() {
        let config = EmbeddingConfig::default();
        assert!(config.normalize);
        assert!(config.instruction.is_none());
        assert_eq!(config.batch_size, 32);
    }

    #[test]
    fn test_embedding_config_serialization() {
        let config = EmbeddingConfig {
            normalize: false,
            instruction: Some(String::from("Search: ")),
            batch_size: 16,
        };
        let deserialized = roundtrip(&config);

        assert_eq!(config.normalize, deserialized.normalize);
        assert_eq!(config.instruction, deserialized.instruction);
        assert_eq!(config.batch_size, deserialized.batch_size);
    }

    #[test]
    fn test_modality_serialization() {
        let cases = [
            (Modality::Text, "\"text\""),
            (Modality::Image, "\"image\""),
            (Modality::Audio, "\"audio\""),
        ];
        for (modality, expected) in cases {
            assert_eq!(to_json(&modality), expected);
        }
    }

    #[test]
    fn test_modality_equality() {
        assert_eq!(Modality::Text, Modality::Text);
        assert_ne!(Modality::Text, Modality::Image);
    }

    // ---- DistanceMetric ----------------------------------------------------

    #[test]
    fn test_distance_metric_default() {
        assert_eq!(DistanceMetric::default(), DistanceMetric::Cosine);
    }

    #[test]
    fn test_distance_metric_serialization() {
        let cases = [
            (DistanceMetric::Cosine, "\"cosine\""),
            (DistanceMetric::L2, "\"l2\""),
            (DistanceMetric::Dot, "\"dot\""),
        ];
        for (metric, expected) in cases {
            assert_eq!(to_json(&metric), expected);
        }
    }

    // ---- SearchResult ------------------------------------------------------

    #[test]
    fn test_search_result_serialization() {
        let result = SearchResult {
            chunk_id: Uuid::new_v4(),
            file_path: PathBuf::from("/test/file.txt"),
            content: String::from("Test content"),
            score: 0.95,
            byte_range: 0..12,
            line_range: Some(0..1),
            metadata: HashMap::new(),
        };

        let deserialized = roundtrip(&result);

        assert_eq!(result.chunk_id, deserialized.chunk_id);
        assert_eq!(result.score, deserialized.score);
        assert_eq!(result.content, deserialized.content);
    }

    // ---- StoreStats / IndexStats -------------------------------------------

    #[test]
    fn test_store_stats_serialization() {
        let stats = StoreStats {
            total_chunks: 100,
            total_files: 10,
            index_size_bytes: 1024 * 1024,
            last_updated: Some(Utc::now()),
        };

        let deserialized = roundtrip(&stats);

        assert_eq!(stats.total_chunks, deserialized.total_chunks);
        assert_eq!(stats.total_files, deserialized.total_files);
    }

    #[test]
    fn test_index_stats_default() {
        let stats = IndexStats::default();
        assert_eq!(stats.total_files, 0);
        assert_eq!(stats.indexed_files, 0);
        assert_eq!(stats.pending_files, 0);
        assert_eq!(stats.error_files, 0);
        assert_eq!(stats.total_chunks, 0);
        assert!(stats.last_update.is_none());
    }

    #[test]
    fn test_index_stats_serialization() {
        let stats = IndexStats {
            total_files: 50,
            indexed_files: 45,
            pending_files: 3,
            error_files: 2,
            total_chunks: 500,
            last_update: Some(Utc::now()),
        };

        let deserialized = roundtrip(&stats);

        assert_eq!(stats.total_files, deserialized.total_files);
        assert_eq!(stats.indexed_files, deserialized.indexed_files);
    }

    // ---- Default-constructed metadata types --------------------------------

    #[test]
    fn test_chunk_metadata_default() {
        let meta = ChunkMetadata::default();
        assert!(meta.embedding_model.is_none());
        assert!(meta.indexed_at.is_none());
        assert!(meta.token_count.is_none());
        assert!(meta.extra.is_empty());
    }

    #[test]
    fn test_chunk_output_metadata_default() {
        let meta = ChunkOutputMetadata::default();
        assert!(meta.symbol_type.is_none());
        assert!(meta.symbol_name.is_none());
        assert!(meta.language.is_none());
    }

    #[test]
    fn test_content_metadata_info_default() {
        let meta = ContentMetadataInfo::default();
        assert!(meta.title.is_none());
        assert!(meta.author.is_none());
        assert!(meta.language.is_none());
        assert!(meta.page_count.is_none());
        assert!(meta.created_at.is_none());
    }

    // ---- FileEvent ---------------------------------------------------------

    #[test]
    fn test_file_event_created() {
        let event = FileEvent::Created(PathBuf::from("/test/new.txt"));
        if let FileEvent::Created(path) = event {
            assert_eq!(path, PathBuf::from("/test/new.txt"));
        } else {
            panic!("Expected Created event");
        }
    }

    #[test]
    fn test_file_event_modified() {
        let event = FileEvent::Modified(PathBuf::from("/test/changed.txt"));
        if let FileEvent::Modified(path) = event {
            assert_eq!(path, PathBuf::from("/test/changed.txt"));
        } else {
            panic!("Expected Modified event");
        }
    }

    #[test]
    fn test_file_event_deleted() {
        let event = FileEvent::Deleted(PathBuf::from("/test/removed.txt"));
        if let FileEvent::Deleted(path) = event {
            assert_eq!(path, PathBuf::from("/test/removed.txt"));
        } else {
            panic!("Expected Deleted event");
        }
    }

    #[test]
    fn test_file_event_renamed() {
        let event = FileEvent::Renamed {
            from: PathBuf::from("/test/old.txt"),
            to: PathBuf::from("/test/new.txt"),
        };
        if let FileEvent::Renamed { from, to } = event {
            assert_eq!(from, PathBuf::from("/test/old.txt"));
            assert_eq!(to, PathBuf::from("/test/new.txt"));
        } else {
            panic!("Expected Renamed event");
        }
    }
}