feat: Initial commit of anything2json

2025-10-29 04:30:14 +01:00 · 2025-10-29 04:30:14 +01:00 · 4c71bec26a
parent 2ba1121c01
commit 4c71bec26a
12 changed files with 691 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,2 @@
+Cargo.lock
+target
--- a/Cargo.toml
+++ b/Cargo.toml
@ -0,0 +1,31 @@
+[package]
+name = "a2j"
+version = "0.1.0"
+edition = "2024"
+
+[dependencies]
+regex = "1.10"
+glob = "0.3"
+tempfile = "3.10"
+mime_guess = "2.0"
+serde = { version = "1.0", features = ["derive"] }
+serde_json = "1.0"
+anyhow = "1.0"
+thiserror = "1.0"
+tracing = "0.1"
+infer = "0.15"
+chardetng = "0.1"
+encoding_rs = "0.8"
+content_inspector = "0.2"
+scraper = "0.19"
+quick-xml = { version = "0.31", features = ["serialize"] }
+pdf-extract = "0.7"
+zip = "0.6"
+calamine = "0.22"
+kamadak-exif = "0.5"
+symphonia = "0.5"
+mailparse = "0.14"
+flate2 = { version = "1.0", features = ["miniz_oxide"], default-features = false }
+tar = "0.4"
+goblin = "0.7"
+clap = { version = "4.4", features = ["derive"] }
--- a/src/extractors/archive.rs
+++ b/src/extractors/archive.rs
@ -0,0 +1,89 @@
+
+use crate::{A2j, Extractor, Source};
+use std::io::Read;
+use zip::ZipArchive;
+use tar::Archive;
+use flate2::read::GzDecoder;
+
+/// Lists the regular files contained in ZIP, gzip-compressed tar and plain
+/// tar archives.
+///
+/// Entries are reported as placeholder attachments (name and size only);
+/// their contents are not extracted.
+pub struct ArchiveExtractor;
+
+impl Extractor for ArchiveExtractor {
+    /// Reads the whole archive from `src` and returns an [`A2j`] whose
+    /// `attachments` hold one placeholder per regular file entry.
+    ///
+    /// `mime_type` selects the container format:
+    /// `application/zip`, `application/gzip` (treated as a gzip-compressed
+    /// tarball) or `application/x-tar` (uncompressed tarball). Any other
+    /// value is not an error; the result then carries an
+    /// "Unsupported archive type" warning and no attachments.
+    ///
+    /// # Errors
+    ///
+    /// Fails when `src` cannot be read or the archive cannot be parsed.
+    fn extract(&self, src: &mut dyn Read, mime_type: &str) -> anyhow::Result<A2j> {
+        let mut buffer = Vec::new();
+        src.read_to_end(&mut buffer)?;
+
+        let mut attachments = Vec::new();
+        let mut warnings = Vec::new();
+
+        match mime_type {
+            "application/zip" => {
+                // Borrow the bytes; there is no need to copy the whole archive.
+                let mut archive = ZipArchive::new(std::io::Cursor::new(buffer.as_slice()))?;
+
+                for i in 0..archive.len() {
+                    let file = archive.by_index(i)?;
+                    if file.is_file() {
+                        attachments.push(placeholder_entry(file.name().to_string(), file.size()));
+                    }
+                }
+            }
+            "application/gzip" => {
+                let decoder = GzDecoder::new(buffer.as_slice());
+                collect_tar_entries(Archive::new(decoder), &mut attachments)?;
+            }
+            // A plain tarball is not compressed, so it must not be fed
+            // through the gzip decoder.
+            "application/x-tar" => {
+                collect_tar_entries(Archive::new(buffer.as_slice()), &mut attachments)?;
+            }
+            _ => warnings.push(format!("Unsupported archive type: {}", mime_type)),
+        }
+
+        Ok(A2j {
+            source: Source {
+                path: "unknown".to_string(),
+                size: buffer.len() as u64,
+                sha256: "unknown".to_string(),
+            },
+            content_type: mime_type.to_string(),
+            encoding: None,
+            meta: serde_json::Value::Null,
+            text: None,
+            structured: None,
+            attachments,
+            warnings,
+            annotations: vec![],
+        })
+    }
+}
+
+/// Builds the placeholder attachment recorded for a single archive member.
+fn placeholder_entry(path: String, size: u64) -> A2j {
+    A2j {
+        source: Source {
+            path,
+            size,
+            sha256: "unknown".to_string(),
+        },
+        content_type: "application/octet-stream".to_string(), // Placeholder
+        encoding: None,
+        meta: serde_json::Value::Null,
+        text: None,
+        structured: None,
+        attachments: vec![],
+        warnings: vec!["Recursive extraction not fully implemented".to_string()],
+        annotations: vec![],
+    }
+}
+
+/// Appends one placeholder attachment per regular file found in `archive`.
+fn collect_tar_entries<R: Read>(
+    mut archive: Archive<R>,
+    attachments: &mut Vec<A2j>,
+) -> anyhow::Result<()> {
+    for entry in archive.entries()? {
+        let entry = entry?;
+        if entry.header().entry_type().is_file() {
+            // Paths that are not valid UTF-8 are reported as "unknown".
+            let path = entry.path()?.to_str().unwrap_or("unknown").to_string();
+            attachments.push(placeholder_entry(path, entry.size()));
+        }
+    }
+    Ok(())
+}
--- a/src/extractors/email.rs
+++ b/src/extractors/email.rs
@ -0,0 +1,39 @@
+
+use crate::{A2j, Extractor, Source};
+use std::io::Read;
+use mailparse::parse_mail;
+use serde_json::json;
+
+/// Extractor for e-mail messages, backed by the `mailparse` crate.
+pub struct EmailExtractor;
+
+impl Extractor for EmailExtractor {
+    /// Parses the message read from `src` and returns its headers as `meta`
+    /// and its body as `text`.
+    ///
+    /// Header names are lower-cased before being used as JSON keys; when a
+    /// header name occurs more than once, the last occurrence wins. The body
+    /// is `None` when `mailparse` cannot decode it.
+    ///
+    /// # Errors
+    ///
+    /// Fails when `src` cannot be read or the message cannot be parsed.
+    fn extract(&self, src: &mut dyn Read, mime_type: &str) -> anyhow::Result<A2j> {
+        let mut raw = Vec::new();
+        src.read_to_end(&mut raw)?;
+        let size = raw.len() as u64;
+
+        let parsed = parse_mail(&raw)?;
+
+        // Collect the headers into a JSON object keyed by lower-cased name.
+        let mut header_map = serde_json::Map::new();
+        for header in parsed.headers.iter() {
+            header_map.insert(header.get_key().to_lowercase(), json!(header.get_value()));
+        }
+
+        let text = parsed.get_body().ok();
+
+        let source = Source {
+            path: "unknown".to_string(),
+            size,
+            sha256: "unknown".to_string(),
+        };
+
+        Ok(A2j {
+            source,
+            content_type: mime_type.to_string(),
+            encoding: None, // mailparse handles encoding internally
+            meta: serde_json::Value::Object(header_map),
+            text,
+            structured: None,
+            attachments: Vec::new(), // TODO: Handle attachments
+            warnings: Vec::new(),
+            annotations: Vec::new(),
+        })
+    }
+}
--- a/src/extractors/html.rs
+++ b/src/extractors/html.rs
@ -0,0 +1,32 @@
+
+use crate::{A2j, Extractor, Source};
+use scraper::{Html, Selector};
+use std::io::Read;
+
+/// Extractor that pulls the visible text of the `<body>` element out of an
+/// HTML document.
+pub struct HtmlExtractor;
+
+impl Extractor for HtmlExtractor {
+    /// Parses the document read from `src` and returns the concatenated text
+    /// nodes of its first `body` element as `text` (`None` if there is none).
+    ///
+    /// The input is decoded as UTF-8. Invalid byte sequences are replaced
+    /// rather than rejected, and a warning is recorded when that happens, so
+    /// documents in legacy encodings still yield a result.
+    ///
+    /// # Errors
+    ///
+    /// Fails only when `src` cannot be read.
+    fn extract(&self, src: &mut dyn Read, mime_type: &str) -> anyhow::Result<A2j> {
+        let mut buffer = Vec::new();
+        src.read_to_end(&mut buffer)?;
+
+        let mut warnings = Vec::new();
+        let decoded = String::from_utf8_lossy(&buffer);
+        // `Cow::Owned` means at least one invalid sequence had to be replaced.
+        if matches!(decoded, std::borrow::Cow::Owned(_)) {
+            warnings.push("Input is not valid UTF-8; invalid bytes were replaced".to_string());
+        }
+
+        let document = Html::parse_document(&decoded);
+        let selector = Selector::parse("body").expect("`body` is a valid CSS selector");
+        let text = document
+            .select(&selector)
+            .next()
+            .map(|body| body.text().collect::<String>());
+
+        Ok(A2j {
+            source: Source {
+                path: "unknown".to_string(),
+                size: buffer.len() as u64,
+                sha256: "unknown".to_string(),
+            },
+            content_type: mime_type.to_string(),
+            encoding: Some("utf-8".to_string()),
+            meta: serde_json::Value::Null, // TODO: Extract metadata
+            text,
+            structured: None,
+            attachments: vec![],
+            warnings,
+            annotations: vec![],
+        })
+    }
+}
--- a/src/extractors/mod.rs
+++ b/src/extractors/mod.rs
@ -0,0 +1,8 @@
+pub mod text;
+pub mod html;
+pub mod structured_text;
+pub mod pdf;
+pub mod zim;
+pub mod office;
+pub mod email;
+pub mod archive;
--- a/src/extractors/office.rs
+++ b/src/extractors/office.rs
@ -0,0 +1,103 @@
+use crate::{A2j, Extractor, Source};
+use std::io::Read;
+use zip::ZipArchive;
+use calamine::{Reader, Xlsx};
+use quick_xml::events::Event;
+use quick_xml::Reader as XmlReader;
+
+/// Extractor for Office Open XML documents (`.docx`, `.pptx`, `.xlsx`).
+pub struct OfficeExtractor;
+
+impl Extractor for OfficeExtractor {
+    /// Reads the document from `src` and returns its plain text.
+    ///
+    /// The behaviour depends on `mime_type`:
+    /// * word-processing documents: the `w:t` runs of `word/document.xml`,
+    ///   concatenated;
+    /// * presentations: the `a:t` runs of every `ppt/slides/slide*` part,
+    ///   each followed by a space;
+    /// * spreadsheets: every worksheet, cells separated by tabs and rows
+    ///   terminated by newlines;
+    /// * anything else: `text` is `None`.
+    ///
+    /// # Errors
+    ///
+    /// Fails when `src` cannot be read or the container/XML cannot be parsed.
+    fn extract(&self, src: &mut dyn Read, mime_type: &str) -> anyhow::Result<A2j> {
+        let mut buffer = Vec::new();
+        src.read_to_end(&mut buffer)?;
+
+        let text = match mime_type {
+            "application/vnd.openxmlformats-officedocument.wordprocessingml.document" => {
+                let mut archive = ZipArchive::new(std::io::Cursor::new(buffer.as_slice()))?;
+                let mut doc = archive.by_name("word/document.xml")?;
+                let mut xml = String::new();
+                doc.read_to_string(&mut xml)?;
+
+                Some(collect_tag_text(&xml, b"w:t", None)?)
+            }
+            "application/vnd.openxmlformats-officedocument.presentationml.presentation" => {
+                let mut archive = ZipArchive::new(std::io::Cursor::new(buffer.as_slice()))?;
+                let mut text = String::new();
+                for i in 0..archive.len() {
+                    let mut file = archive.by_index(i)?;
+                    if file.name().starts_with("ppt/slides/slide") {
+                        let mut slide_xml = String::new();
+                        file.read_to_string(&mut slide_xml)?;
+                        text.push_str(&collect_tag_text(&slide_xml, b"a:t", Some(' '))?);
+                    }
+                }
+                Some(text)
+            }
+            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" => {
+                let mut xlsx: Xlsx<_> =
+                    calamine::open_workbook_from_rs(std::io::Cursor::new(buffer.as_slice()))?;
+                // Copy the names first: reading a range needs `&mut xlsx`.
+                let sheet_names = xlsx.sheet_names().to_vec();
+                let mut text = String::new();
+                // Walk every sheet instead of assuming one called "Sheet1".
+                for sheet in &sheet_names {
+                    if let Some(Ok(range)) = xlsx.worksheet_range(sheet) {
+                        for row in range.rows() {
+                            for cell in row {
+                                text.push_str(&cell.to_string());
+                                text.push('\t');
+                            }
+                            text.push('\n');
+                        }
+                    }
+                }
+                Some(text)
+            }
+            _ => None,
+        };
+
+        Ok(A2j {
+            source: Source {
+                path: "unknown".to_string(),
+                size: buffer.len() as u64,
+                sha256: "unknown".to_string(),
+            },
+            content_type: mime_type.to_string(),
+            encoding: None,
+            meta: serde_json::Value::Null, // TODO: Extract metadata
+            text,
+            structured: None,
+            attachments: vec![],
+            warnings: vec![],
+            annotations: vec![],
+        })
+    }
+}
+
+/// Concatenates the content of every `tag` element in `xml`, appending
+/// `separator` (if any) after each one.
+fn collect_tag_text(xml: &str, tag: &[u8], separator: Option<char>) -> anyhow::Result<String> {
+    let mut reader = XmlReader::from_str(xml);
+    reader.trim_text(true);
+
+    let mut out = String::new();
+    loop {
+        match reader.read_event()? {
+            Event::Start(e) if e.name().as_ref() == tag => {
+                // `read_text` hands back the raw document slice, so XML
+                // entities (`&amp;`, `&lt;`, ...) still have to be resolved.
+                // Fall back to the raw text if it cannot be unescaped.
+                let raw = reader.read_text(e.name())?;
+                match quick_xml::escape::unescape(&raw) {
+                    Ok(unescaped) => out.push_str(&unescaped),
+                    Err(_) => out.push_str(&raw),
+                }
+                if let Some(sep) = separator {
+                    out.push(sep);
+                }
+            }
+            Event::Eof => break,
+            _ => {}
+        }
+    }
+    Ok(out)
+}
--- a/src/extractors/pdf.rs
+++ b/src/extractors/pdf.rs
@ -0,0 +1,29 @@
+
+use crate::{A2j, Extractor, Source};
+use std::io::Read;
+
+/// Extractor for PDF documents, backed by the `pdf_extract` crate.
+pub struct PdfExtractor;
+
+impl Extractor for PdfExtractor {
+    /// Reads the whole PDF from `src` into memory and returns the text that
+    /// `pdf_extract` recovers from it.
+    ///
+    /// # Errors
+    ///
+    /// Fails when `src` cannot be read or text extraction fails.
+    fn extract(&self, src: &mut dyn Read, mime_type: &str) -> anyhow::Result<A2j> {
+        let mut pdf_bytes = Vec::new();
+        src.read_to_end(&mut pdf_bytes)?;
+
+        let extracted = pdf_extract::extract_text_from_mem(&pdf_bytes)?;
+
+        let source = Source {
+            path: "unknown".to_string(),
+            size: pdf_bytes.len() as u64,
+            sha256: "unknown".to_string(),
+        };
+
+        Ok(A2j {
+            source,
+            content_type: mime_type.to_string(),
+            encoding: None,
+            meta: serde_json::Value::Null, // TODO: Extract metadata
+            text: Some(extracted),
+            structured: None,
+            attachments: Vec::new(),
+            warnings: Vec::new(),
+            annotations: Vec::new(),
+        })
+    }
+}
--- a/src/extractors/structured_text.rs
+++ b/src/extractors/structured_text.rs
@ -0,0 +1,37 @@
+
+use crate::{A2j, Extractor, Source};
+use serde_json::Value;
+use std::io::Read;
+
+/// Extractor for structured text formats; fills the `structured` field.
+pub struct StructuredTextExtractor;
+
+impl Extractor for StructuredTextExtractor {
+    /// Reads `src` as UTF-8 text and stores it in `structured`.
+    ///
+    /// For `application/json` the text is parsed into a JSON value. For every
+    /// other MIME type the raw text is stored unchanged as a JSON string.
+    ///
+    /// # Errors
+    ///
+    /// Fails when `src` cannot be read as UTF-8 or when JSON parsing fails.
+    fn extract(&self, src: &mut dyn Read, mime_type: &str) -> anyhow::Result<A2j> {
+        let mut raw = String::new();
+        src.read_to_string(&mut raw)?;
+        // Record the size up front so the text can be moved into the result.
+        let size = raw.len() as u64;
+
+        let structured: Value = match mime_type {
+            "application/json" => serde_json::from_str(&raw)?,
+            // XML (and anything else) is kept verbatim for now; a richer
+            // implementation could convert it into a JSON-like structure.
+            _ => Value::String(raw),
+        };
+
+        let source = Source {
+            path: "unknown".to_string(),
+            size,
+            sha256: "unknown".to_string(),
+        };
+
+        Ok(A2j {
+            source,
+            content_type: mime_type.to_string(),
+            encoding: Some("utf-8".to_string()),
+            meta: Value::Null,
+            text: None,
+            structured: Some(structured),
+            attachments: Vec::new(),
+            warnings: Vec::new(),
+            annotations: Vec::new(),
+        })
+    }
+}
--- a/src/extractors/text.rs
+++ b/src/extractors/text.rs
@ -0,0 +1,29 @@
+
+use crate::{A2j, Extractor, Source};
+use std::io::Read;
+
+/// Extractor for plain text input.
+pub struct TextExtractor;
+
+impl Extractor for TextExtractor {
+    /// Reads all bytes from `src` and returns them as `text`, decoding them
+    /// as UTF-8 and replacing invalid sequences instead of failing.
+    ///
+    /// # Errors
+    ///
+    /// Fails only when `src` cannot be read.
+    fn extract(&self, src: &mut dyn Read, mime_type: &str) -> anyhow::Result<A2j> {
+        let mut bytes = Vec::new();
+        src.read_to_end(&mut bytes)?;
+
+        let decoded = String::from_utf8_lossy(&bytes).into_owned();
+
+        let source = Source {
+            path: "unknown".to_string(),
+            size: bytes.len() as u64,
+            sha256: "unknown".to_string(),
+        };
+
+        Ok(A2j {
+            source,
+            content_type: mime_type.to_string(),
+            encoding: Some("utf-8".to_string()),
+            meta: serde_json::Value::Null,
+            text: Some(decoded),
+            structured: None,
+            attachments: Vec::new(),
+            warnings: Vec::new(),
+            annotations: Vec::new(),
+        })
+    }
+}
--- a/src/extractors/zim.rs
+++ b/src/extractors/zim.rs
@ -0,0 +1,83 @@
+
+use crate::{A2j, Extractor, Source};
+use std::io::{Read, Write};
+use std::process::Command;
+use tempfile::{tempdir, NamedTempFile};
+
+/// Extractor for ZIM archives that shells out to the external `zimdump` tool.
+pub struct ZimExtractor;
+
+impl Extractor for ZimExtractor {
+    /// Dumps the ZIM archive read from `src` into a temporary directory with
+    /// `zimdump dump` and returns the content of its main page as `text`.
+    ///
+    /// The main page is looked up as `index.html`, then `index`, then the
+    /// first `*.html` file found anywhere in the dump; `text` is `None` when
+    /// none of them exists. When `zimdump` cannot be started at all, the
+    /// result carries a warning instead of an error.
+    ///
+    /// # Errors
+    ///
+    /// Fails when `src` cannot be read, a temporary file or directory cannot
+    /// be created, `zimdump` exits unsuccessfully, or the main page cannot be
+    /// read as UTF-8 text.
+    fn extract(&self, src: &mut dyn Read, mime_type: &str) -> anyhow::Result<A2j> {
+        let mut buffer = Vec::new();
+        src.read_to_end(&mut buffer)?;
+        let size = buffer.len() as u64;
+
+        // Check if zimdump is installed
+        if Command::new("zimdump").arg("--version").output().is_err() {
+            return Ok(zim_result(
+                size,
+                mime_type,
+                None,
+                vec!["zimdump tool not found. Please install it to process ZIM files.".to_string()],
+            ));
+        }
+
+        let mut temp_file = NamedTempFile::new()?;
+        temp_file.write_all(&buffer)?;
+        // Make sure everything is handed to the OS before the child reads it.
+        temp_file.flush()?;
+
+        let temp_dir = tempdir()?;
+
+        // Build the argument as an `OsString` so that a temporary path that
+        // is not valid UTF-8 cannot cause a panic.
+        let mut dir_arg = std::ffi::OsString::from("--dir=");
+        dir_arg.push(temp_dir.path());
+
+        let output = Command::new("zimdump")
+            .arg("dump")
+            .arg(&dir_arg)
+            .arg(temp_file.path())
+            .output()?;
+
+        if !output.status.success() {
+            return Err(anyhow::anyhow!("zimdump failed with status: {}\n{}", output.status, String::from_utf8_lossy(&output.stderr)));
+        }
+
+        let text = read_main_page(temp_dir.path())?;
+
+        Ok(zim_result(size, mime_type, text, vec![]))
+    }
+}
+
+/// Builds the extraction result shared by the success and "tool missing" paths.
+fn zim_result(size: u64, mime_type: &str, text: Option<String>, warnings: Vec<String>) -> A2j {
+    A2j {
+        source: Source {
+            path: "unknown".to_string(),
+            size,
+            sha256: "unknown".to_string(),
+        },
+        content_type: mime_type.to_string(),
+        encoding: None,
+        meta: serde_json::Value::Null, // TODO: Extract metadata
+        text,
+        structured: None,
+        attachments: vec![],
+        warnings,
+        annotations: vec![],
+    }
+}
+
+/// Returns the content of the dump's main page, or `None` if no candidate exists.
+fn read_main_page(dump_dir: &std::path::Path) -> anyhow::Result<Option<String>> {
+    for candidate in ["index.html", "index"] {
+        let path = dump_dir.join(candidate);
+        if path.exists() {
+            return Ok(Some(std::fs::read_to_string(path)?));
+        }
+    }
+
+    // Neither well-known name exists: fall back to the first HTML file found.
+    let pattern = dump_dir.join("**/*.html");
+    let pattern = pattern.to_str().ok_or_else(|| {
+        anyhow::anyhow!("temporary directory path is not valid UTF-8: {}", dump_dir.display())
+    })?;
+    match glob::glob(pattern)?.next() {
+        Some(Ok(path)) => Ok(Some(std::fs::read_to_string(path)?)),
+        _ => Ok(None),
+    }
+}
--- a/src/main.rs
+++ b/src/main.rs
@ -0,0 +1,209 @@
+
+use serde::{Deserialize, Serialize};
+use std::io::Read;
+use clap::Parser;
+use std::fs::File;
+use std::io::BufReader;
+use std::collections::HashMap;
+use anyhow::Context;
+use regex::Regex;
+
+mod extractors;
+use extractors::text::TextExtractor;
+use extractors::html::HtmlExtractor;
+use extractors::structured_text::StructuredTextExtractor;
+use extractors::pdf::PdfExtractor;
+use extractors::zim::ZimExtractor;
+use extractors::office::OfficeExtractor;
+use extractors::email::EmailExtractor;
+use extractors::archive::ArchiveExtractor;
+
+/// Represents a single annotation found in the text.
+///
+/// One value is produced per regex match by `apply_patterns` and serialized
+/// as part of the output JSON.
+#[derive(Serialize, Debug)]
+pub struct Annotation {
+    /// Annotation category, copied from the matching pattern's `type`.
+    pub r#type: String,
+    /// The matched text.
+    pub value: String,
+    /// `id` of the pattern that produced this annotation.
+    pub pattern_id: String,
+    /// Start of the match within the text, as reported by the regex match.
+    pub start_offset: Option<usize>,
+    /// End of the match within the text, as reported by the regex match.
+    pub end_offset: Option<usize>,
+}
+
+/// Represents a single pattern configuration for annotation.
+///
+/// A list of these is deserialized from the JSON file given on the command
+/// line (see `load_patterns`).
+#[derive(Deserialize, Debug)]
+pub struct PatternConfig {
+    /// Identifier copied into each resulting annotation's `pattern_id`.
+    pub id: String,
+    /// Regular expression source, compiled with the `regex` crate.
+    pub regex: String,
+    /// Category copied into each resulting annotation's `type`.
+    pub r#type: String,
+    /// Whether matching is case sensitive; treated as `true` when absent.
+    pub case_sensitive: Option<bool>,
+    /// Optional free-form description; not read by the code in this file.
+    pub description: Option<String>,
+}
+
+/// Common result object serialized to JSON.
+///
+/// Every extractor returns one of these; `main` fills in the real source path
+/// and size before printing it.
+#[derive(Serialize)]
+pub struct A2j {
+  /// Where the content came from (path, size, checksum placeholder).
+  pub source: Source,
+  /// MIME type the content was processed as.
+  pub content_type: String,
+  /// Text encoding reported by the extractor, if any.
+  pub encoding: Option<String>,
+  /// Format-specific metadata; `Null` when the extractor provides none.
+  pub meta: serde_json::Value,
+  /// Extracted plain text, if any; this is what annotations are matched on.
+  pub text: Option<String>,
+  /// Structured representation of the content, if any.
+  pub structured: Option<serde_json::Value>,
+  /// Nested results for contained items (for example archive members).
+  pub attachments: Vec<A2j>,
+  /// Non-fatal problems encountered during extraction.
+  pub warnings: Vec<String>,
+  /// Regex annotations found in `text`; empty unless patterns were supplied.
+  pub annotations: Vec<Annotation>,
+}
+
+/// Describes the origin of an extracted item.
+#[derive(Serialize)]
+pub struct Source {
+    /// Path or entry name; extractors use "unknown" when they do not know it.
+    pub path: String,
+    /// Size in bytes.
+    pub size: u64,
+    /// SHA-256 digest; every extractor in this commit sets it to "unknown".
+    pub sha256: String,
+}
+
+/// Interface implemented by every format-specific extractor.
+///
+/// Implementations must be `Send + Sync`.
+pub trait Extractor: Send + Sync {
+  /// Reads the content from `src`, interprets it as `mime_type` and returns
+  /// the extraction result, or an error if the content cannot be processed.
+  fn extract(&self, src: &mut dyn Read, mime_type: &str) -> anyhow::Result<A2j>;
+}
+
+// Command-line arguments, parsed with clap's derive API.
+// Plain `//` comments are used for the notes added here on purpose: `///` doc
+// comments on clap derive items are turned into user-visible help text.
+#[derive(Parser)]
+#[command(author, version, about, long_about = None)]
+struct Cli {
+    /// The path to the file to process, or - for stdin
+    #[arg(value_name = "PATH")]
+    path: String,
+
+    /// Pretty print the output JSON
+    #[arg(short, long)]
+    pretty: bool,
+
+    /// Optional path to a JSON file containing regex patterns for annotation.
+    // When absent, no annotations are computed (see `main`).
+    #[arg(long)]
+    annotations_config: Option<String>,
+}
+
+/// Runs every pattern in `patterns` over `text` and collects one
+/// [`Annotation`] per match, in pattern order and then match order.
+///
+/// A pattern whose `case_sensitive` is `Some(false)` is compiled with the
+/// `(?i)` flag prepended; otherwise it is compiled as written.
+///
+/// # Errors
+///
+/// Fails when any pattern is not a valid regular expression.
+fn apply_patterns(text: &str, patterns: &[PatternConfig]) -> anyhow::Result<Vec<Annotation>> {
+    let mut found = Vec::new();
+
+    for pattern in patterns.iter() {
+        let case_sensitive = pattern.case_sensitive.unwrap_or(true);
+        let expression: std::borrow::Cow<'_, str> = match case_sensitive {
+            true => std::borrow::Cow::Borrowed(pattern.regex.as_str()),
+            false => std::borrow::Cow::Owned(format!("(?i){}", pattern.regex)),
+        };
+        let compiled = Regex::new(&expression)?;
+
+        found.extend(compiled.find_iter(text).map(|hit| Annotation {
+            r#type: pattern.r#type.clone(),
+            value: hit.as_str().to_string(),
+            pattern_id: pattern.id.clone(),
+            start_offset: Some(hit.start()),
+            end_offset: Some(hit.end()),
+        }));
+    }
+
+    Ok(found)
+}
+
+/// Loads the annotation patterns from the JSON file at `config_path`.
+///
+/// The file must contain a JSON array of pattern objects.
+///
+/// # Errors
+///
+/// Fails, with context, when the file cannot be opened or its content cannot
+/// be deserialized.
+fn load_patterns(config_path: &str) -> anyhow::Result<Vec<PatternConfig>> {
+    let config_file = File::open(config_path)
+        .with_context(|| format!("Failed to open annotations config file: {}", config_path))?;
+
+    serde_json::from_reader::<_, Vec<PatternConfig>>(BufReader::new(config_file))
+        .with_context(|| "Failed to deserialize annotations config")
+}
+
+/// Entry point: reads the input, detects its type, runs the matching
+/// extractor, optionally annotates the extracted text and prints the result
+/// as JSON on stdout.
+///
+/// # Errors
+///
+/// Fails when the input or the annotations config cannot be read, when the
+/// extractor fails, or when the result cannot be serialized.
+fn main() -> anyhow::Result<()> {
+    let cli = Cli::parse();
+
+    let registry = build_registry();
+
+    let mut input: Box<dyn Read> = if cli.path == "-" {
+        Box::new(BufReader::new(std::io::stdin()))
+    } else {
+        Box::new(BufReader::new(File::open(&cli.path).with_context(|| format!("Failed to open file: {}", cli.path))?))
+    };
+
+    let mut buffer = Vec::new();
+    input.read_to_end(&mut buffer)?;
+
+    let mime_type = detect_mime_type(&buffer, &cli.path);
+
+    let fallback_extractor: Box<dyn Extractor> = Box::new(FallbackExtractor);
+    let extractor = registry.get(mime_type.as_str()).unwrap_or(&fallback_extractor);
+
+    // The extractor only needs to read the bytes, so lend it a view of the
+    // buffer instead of cloning the whole input.
+    let mut result = extractor.extract(&mut std::io::Cursor::new(buffer.as_slice()), &mime_type)?;
+    // Extractors do not know where the data came from; fill that in here.
+    result.source.path = cli.path.clone();
+    result.source.size = buffer.len() as u64;
+
+    if let Some(config_path) = cli.annotations_config {
+        let patterns = load_patterns(&config_path)?;
+        if let Some(text) = &result.text {
+            let annotations = apply_patterns(text, &patterns)?;
+            result.annotations = annotations;
+        }
+    }
+
+    let output = if cli.pretty {
+        serde_json::to_string_pretty(&result)?
+    } else {
+        serde_json::to_string(&result)?
+    };
+
+    println!("{}", output);
+
+    Ok(())
+}
+
+/// Builds the table mapping MIME types to the extractor that handles them.
+fn build_registry() -> HashMap<&'static str, Box<dyn Extractor>> {
+    let mut registry: HashMap<&'static str, Box<dyn Extractor>> = HashMap::new();
+    registry.insert("text/plain", Box::new(TextExtractor));
+    registry.insert("text/html", Box::new(HtmlExtractor));
+    registry.insert("application/json", Box::new(StructuredTextExtractor));
+    registry.insert("text/xml", Box::new(StructuredTextExtractor));
+    registry.insert("application/pdf", Box::new(PdfExtractor));
+    registry.insert("application/x-zim", Box::new(ZimExtractor));
+    registry.insert("application/vnd.openxmlformats-officedocument.wordprocessingml.document", Box::new(OfficeExtractor));
+    registry.insert("application/vnd.openxmlformats-officedocument.presentationml.presentation", Box::new(OfficeExtractor));
+    registry.insert("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", Box::new(OfficeExtractor));
+    registry.insert("message/rfc822", Box::new(EmailExtractor));
+    registry.insert("application/zip", Box::new(ArchiveExtractor));
+    registry.insert("application/gzip", Box::new(ArchiveExtractor));
+    registry.insert("application/x-tar", Box::new(ArchiveExtractor));
+    registry
+}
+
+/// Determines the MIME type of `buffer`; `path` is used only for its extension.
+///
+/// Order of precedence: ZIM magic bytes, then the file extension for ZIP
+/// containers holding Office documents, then content sniffing via `infer`,
+/// and finally a guess from the path (defaulting to `text/plain`).
+fn detect_mime_type(buffer: &[u8], path: &str) -> String {
+    const OOXML_TYPES: [(&str, &str); 3] = [
+        (".docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"),
+        (".pptx", "application/vnd.openxmlformats-officedocument.presentationml.presentation"),
+        (".xlsx", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"),
+    ];
+
+    if buffer.starts_with(b"ZIM\x04") {
+        return "application/x-zim".to_string();
+    }
+
+    // ZIP local-file-header signature ("PK\x03\x04"): trust the extension
+    // to tell Office documents apart from ordinary archives.
+    if buffer.starts_with(b"\x50\x4B\x03\x04") {
+        let lowered = path.to_lowercase();
+        if let Some((_, mime)) = OOXML_TYPES.iter().find(|(ext, _)| lowered.ends_with(ext)) {
+            return mime.to_string();
+        }
+    }
+
+    match infer::get(buffer) {
+        Some(kind) => kind.mime_type().to_string(),
+        None => mime_guess::from_path(path).first_or_text_plain().to_string(),
+    }
+}
+
+/// Extractor used when no registered extractor matches the detected MIME type.
+struct FallbackExtractor;
+
+impl Extractor for FallbackExtractor {
+    /// Consumes `src` to measure its size and returns an otherwise empty
+    /// result carrying a "No extractor found" warning.
+    ///
+    /// # Errors
+    ///
+    /// Fails only when `src` cannot be read.
+    fn extract(&self, src: &mut dyn Read, mime_type: &str) -> anyhow::Result<A2j> {
+        let mut consumed = Vec::new();
+        src.read_to_end(&mut consumed)?;
+
+        let source = Source {
+            path: "unknown".to_string(),
+            size: consumed.len() as u64,
+            sha256: "unknown".to_string(),
+        };
+        let warnings = vec!["No extractor found for this file type".to_string()];
+
+        Ok(A2j {
+            source,
+            content_type: mime_type.to_string(),
+            encoding: None,
+            meta: serde_json::Value::Null,
+            text: None,
+            structured: None,
+            attachments: Vec::new(),
+            warnings,
+            annotations: Vec::new(),
+        })
+    }
+}