feat: Initial commit of anything2json

This commit is contained in:
Gemini 2025-10-29 04:30:14 +01:00
parent 2ba1121c01
commit 4c71bec26a
12 changed files with 691 additions and 0 deletions

2
.gitignore vendored Normal file
View File

@ -0,0 +1,2 @@
Cargo.lock
target

31
Cargo.toml Normal file
View File

@ -0,0 +1,31 @@
[package]
name = "a2j"
version = "0.1.0"
edition = "2024"
[dependencies]
regex = "1.10"
glob = "0.3"
tempfile = "3.10"
mime_guess = "2.0"
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"
anyhow = "1.0"
thiserror = "1.0"
tracing = "0.1"
infer = "0.15"
chardetng = "0.1"
encoding_rs = "0.8"
content_inspector = "0.2"
scraper = "0.19"
quick-xml = { version = "0.31", features = ["serialize"] }
pdf-extract = "0.7"
zip = "0.6"
calamine = "0.22"
kamadak-exif = "0.5"
symphonia = "0.5"
mailparse = "0.14"
flate2 = { version = "1.0", features = ["miniz_oxide"], default-features = false }
tar = "0.4"
goblin = "0.7"
clap = { version = "4.4", features = ["derive"] }

89
src/extractors/archive.rs Normal file
View File

@ -0,0 +1,89 @@
use crate::{A2j, Extractor, Source};
use std::io::Read;
use zip::ZipArchive;
use tar::Archive;
use flate2::read::GzDecoder;
pub struct ArchiveExtractor;
impl Extractor for ArchiveExtractor {
fn extract(&self, src: &mut dyn Read, mime_type: &str) -> anyhow::Result<A2j> {
let mut buffer = Vec::new();
src.read_to_end(&mut buffer)?;
let mut attachments = Vec::new();
let mut warnings = Vec::new();
match mime_type {
"application/zip" => {
let cursor = std::io::Cursor::new(buffer.clone());
let mut archive = ZipArchive::new(cursor)?;
for i in 0..archive.len() {
let file = archive.by_index(i)?;
if file.is_file() {
attachments.push(A2j {
source: Source {
path: file.name().to_string(),
size: file.size(),
sha256: "unknown".to_string(),
},
content_type: "application/octet-stream".to_string(), // Placeholder
encoding: None,
meta: serde_json::Value::Null,
text: None,
structured: None,
attachments: vec![],
warnings: vec!["Recursive extraction not fully implemented".to_string()],
annotations: vec![],
});
}
}
}
"application/gzip" | "application/x-tar" => {
let cursor = std::io::Cursor::new(buffer.clone());
let decoder = GzDecoder::new(cursor);
let mut archive = Archive::new(decoder);
for file in archive.entries()? {
let file = file?;
if file.header().entry_type().is_file() {
attachments.push(A2j {
source: Source {
path: file.path()?.to_str().unwrap_or("unknown").to_string(),
size: file.size(),
sha256: "unknown".to_string(),
},
content_type: "application/octet-stream".to_string(), // Placeholder
encoding: None,
meta: serde_json::Value::Null,
text: None,
structured: None,
attachments: vec![],
warnings: vec!["Recursive extraction not fully implemented".to_string()],
annotations: vec![],
});
}
}
}
_ => warnings.push(format!("Unsupported archive type: {}", mime_type)),
}
Ok(A2j {
source: Source {
path: "unknown".to_string(),
size: buffer.len() as u64,
sha256: "unknown".to_string(),
},
content_type: mime_type.to_string(),
encoding: None,
meta: serde_json::Value::Null,
text: None,
structured: None,
attachments,
warnings,
annotations: vec![],
})
}
}

39
src/extractors/email.rs Normal file
View File

@ -0,0 +1,39 @@
use crate::{A2j, Extractor, Source};
use std::io::Read;
use mailparse::parse_mail;
use serde_json::json;
pub struct EmailExtractor;
impl Extractor for EmailExtractor {
fn extract(&self, src: &mut dyn Read, mime_type: &str) -> anyhow::Result<A2j> {
let mut buffer = Vec::new();
src.read_to_end(&mut buffer)?;
let mail = parse_mail(&buffer)?;
let mut headers = json!({});
for header in &mail.headers {
headers[&header.get_key().to_lowercase()] = json!(header.get_value());
}
let text_body = mail.get_body().ok();
Ok(A2j {
source: Source {
path: "unknown".to_string(),
size: buffer.len() as u64,
sha256: "unknown".to_string(),
},
content_type: mime_type.to_string(),
encoding: None, // mailparse handles encoding internally
meta: headers,
text: text_body,
structured: None,
attachments: vec![], // TODO: Handle attachments
warnings: vec![],
annotations: vec![],
})
}
}

32
src/extractors/html.rs Normal file
View File

@ -0,0 +1,32 @@
use crate::{A2j, Extractor, Source};
use scraper::{Html, Selector};
use std::io::Read;
/// Extractor that returns the text content of an HTML document's `<body>`.
pub struct HtmlExtractor;

impl Extractor for HtmlExtractor {
    /// Parses the HTML read from `src` and concatenates all text nodes found
    /// under the first `<body>` element.
    ///
    /// Input that is not valid UTF-8 is decoded lossily (invalid sequences are
    /// replaced with U+FFFD) and a warning is recorded, rather than failing the
    /// whole extraction. `source.size` is the number of raw input bytes.
    ///
    /// # Errors
    ///
    /// Returns an error when reading `src` fails.
    fn extract(&self, src: &mut dyn Read, mime_type: &str) -> anyhow::Result<A2j> {
        let mut raw = Vec::new();
        src.read_to_end(&mut raw)?;

        let mut warnings = Vec::new();
        let decoded = String::from_utf8_lossy(&raw);
        // `Cow::Owned` means at least one invalid sequence had to be replaced.
        if matches!(decoded, std::borrow::Cow::Owned(_)) {
            warnings
                .push("Input is not valid UTF-8; invalid sequences were replaced".to_string());
        }

        let document = Html::parse_document(&decoded);
        let selector = Selector::parse("body").expect("`body` is a valid CSS selector");
        let text = document
            .select(&selector)
            .next()
            .map(|body| body.text().collect::<String>());

        Ok(A2j {
            source: Source {
                path: "unknown".to_string(),
                size: raw.len() as u64,
                sha256: "unknown".to_string(),
            },
            content_type: mime_type.to_string(),
            encoding: Some("utf-8".to_string()),
            meta: serde_json::Value::Null, // TODO: Extract metadata
            text,
            structured: None,
            attachments: vec![],
            warnings,
            annotations: vec![],
        })
    }
}

8
src/extractors/mod.rs Normal file
View File

@ -0,0 +1,8 @@
pub mod text;
pub mod html;
pub mod structured_text;
pub mod pdf;
pub mod zim;
pub mod office;
pub mod email;
pub mod archive;

103
src/extractors/office.rs Normal file
View File

@ -0,0 +1,103 @@
use crate::{A2j, Extractor, Source};
use std::io::Read;
use zip::ZipArchive;
use calamine::{Reader, Xlsx};
use quick_xml::events::Event;
use quick_xml::Reader as XmlReader;
/// Extractor for Office Open XML documents (`.docx`, `.pptx`, `.xlsx`).
pub struct OfficeExtractor;

impl Extractor for OfficeExtractor {
    /// Extracts plain text from an Office Open XML document read from `src`.
    ///
    /// The format is selected by `mime_type`:
    /// - wordprocessingml: text of every `w:t` element in `word/document.xml`,
    ///   concatenated.
    /// - presentationml: text of every `a:t` element in each
    ///   `ppt/slides/slide*` entry, each followed by a space.
    /// - spreadsheetml: every worksheet in the workbook, cells separated by
    ///   tabs and rows terminated by newlines. A worksheet that cannot be read
    ///   is skipped and reported in `warnings`.
    ///
    /// Any other MIME type yields `text: None`.
    ///
    /// # Errors
    ///
    /// Returns an error when reading `src` fails, the container cannot be
    /// opened, or the embedded XML is malformed.
    fn extract(&self, src: &mut dyn Read, mime_type: &str) -> anyhow::Result<A2j> {
        let mut buffer = Vec::new();
        src.read_to_end(&mut buffer)?;
        // Record the size first so the buffer can be moved into the cursor
        // without cloning it.
        let size = buffer.len() as u64;
        let cursor = std::io::Cursor::new(buffer);
        let mut warnings = Vec::new();

        let text = match mime_type {
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document" => {
                let mut archive = ZipArchive::new(cursor)?;
                let mut xml = String::new();
                archive
                    .by_name("word/document.xml")?
                    .read_to_string(&mut xml)?;
                let mut txt = String::new();
                append_element_text(&xml, b"w:t", None, &mut txt)?;
                Some(txt)
            }
            "application/vnd.openxmlformats-officedocument.presentationml.presentation" => {
                let mut archive = ZipArchive::new(cursor)?;
                let mut text = String::new();
                for index in 0..archive.len() {
                    let mut entry = archive.by_index(index)?;
                    if !entry.name().starts_with("ppt/slides/slide") {
                        continue;
                    }
                    let mut slide_xml = String::new();
                    entry.read_to_string(&mut slide_xml)?;
                    append_element_text(&slide_xml, b"a:t", Some(' '), &mut text)?;
                }
                Some(text)
            }
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" => {
                let mut workbook: Xlsx<_> = calamine::open_workbook_from_rs(cursor)?;
                // Walk every sheet rather than assuming one called "Sheet1";
                // the names are copied so the workbook can be borrowed mutably.
                let sheet_names = workbook.sheet_names().to_owned();
                let mut text = String::new();
                for name in &sheet_names {
                    match workbook.worksheet_range(name) {
                        Some(Ok(range)) => {
                            for row in range.rows() {
                                for cell in row {
                                    text.push_str(&cell.to_string());
                                    text.push('\t');
                                }
                                text.push('\n');
                            }
                        }
                        Some(Err(err)) => {
                            warnings.push(format!("Failed to read worksheet '{}': {}", name, err));
                        }
                        None => {}
                    }
                }
                Some(text)
            }
            _ => None,
        };

        Ok(A2j {
            source: Source {
                path: "unknown".to_string(),
                size,
                sha256: "unknown".to_string(),
            },
            content_type: mime_type.to_string(),
            encoding: None,
            meta: serde_json::Value::Null, // TODO: Extract metadata
            text,
            structured: None,
            attachments: vec![],
            warnings,
            annotations: vec![],
        })
    }
}

/// Appends the text content of every `tag` element in `xml` to `out`,
/// following each piece with `separator` when one is given.
fn append_element_text(
    xml: &str,
    tag: &[u8],
    separator: Option<char>,
    out: &mut String,
) -> anyhow::Result<()> {
    let mut reader = XmlReader::from_str(xml);
    reader.trim_text(true);
    loop {
        match reader.read_event()? {
            Event::Start(start) if start.name().as_ref() == tag => {
                out.push_str(&reader.read_text(start.name())?);
                if let Some(sep) = separator {
                    out.push(sep);
                }
            }
            Event::Eof => break,
            _ => {}
        }
    }
    Ok(())
}

29
src/extractors/pdf.rs Normal file
View File

@ -0,0 +1,29 @@
use crate::{A2j, Extractor, Source};
use std::io::Read;
/// Extractor that pulls the plain text out of a PDF document via `pdf_extract`.
pub struct PdfExtractor;

impl Extractor for PdfExtractor {
    /// Reads the whole PDF from `src` into memory and returns its extracted text.
    ///
    /// # Errors
    ///
    /// Returns an error when reading `src` fails or `pdf_extract` reports a
    /// failure for the document.
    fn extract(&self, src: &mut dyn Read, mime_type: &str) -> anyhow::Result<A2j> {
        let mut raw = Vec::new();
        src.read_to_end(&mut raw)?;

        let extracted = pdf_extract::extract_text_from_mem(&raw)?;

        let source = Source {
            path: "unknown".to_string(),
            size: raw.len() as u64,
            sha256: "unknown".to_string(),
        };
        Ok(A2j {
            source,
            content_type: mime_type.to_string(),
            encoding: None,
            meta: serde_json::Value::Null, // TODO: Extract metadata
            text: Some(extracted),
            structured: None,
            attachments: vec![],
            warnings: vec![],
            annotations: vec![],
        })
    }
}

View File

@ -0,0 +1,37 @@
use crate::{A2j, Extractor, Source};
use serde_json::Value;
use std::io::Read;
/// Extractor for structured text formats; fills `structured` instead of `text`.
pub struct StructuredTextExtractor;

impl Extractor for StructuredTextExtractor {
    /// Reads `src` as UTF-8 text and stores it in the `structured` field.
    ///
    /// When `mime_type` is `application/json` the input is parsed into a JSON
    /// value. For every other MIME type the raw input is stored unchanged as a
    /// JSON string.
    ///
    /// # Errors
    ///
    /// Returns an error when `src` cannot be read as UTF-8 or, for JSON input,
    /// when parsing fails.
    fn extract(&self, src: &mut dyn Read, mime_type: &str) -> anyhow::Result<A2j> {
        let mut raw = String::new();
        src.read_to_string(&mut raw)?;
        let byte_len = raw.len() as u64;

        let structured = match mime_type {
            "application/json" => serde_json::from_str(&raw)?,
            // For XML, the raw markup is kept as a string for now. A more
            // advanced implementation could convert it to a JSON-like structure.
            _ => Value::String(raw),
        };

        Ok(A2j {
            source: Source {
                path: "unknown".to_string(),
                size: byte_len,
                sha256: "unknown".to_string(),
            },
            content_type: mime_type.to_string(),
            encoding: Some("utf-8".to_string()),
            meta: serde_json::Value::Null,
            text: None,
            structured: Some(structured),
            attachments: vec![],
            warnings: vec![],
            annotations: vec![],
        })
    }
}

29
src/extractors/text.rs Normal file
View File

@ -0,0 +1,29 @@
use crate::{A2j, Extractor, Source};
use std::io::Read;
/// Extractor for plain text input.
pub struct TextExtractor;

impl Extractor for TextExtractor {
    /// Reads all of `src` and returns it as text.
    ///
    /// The bytes are decoded as UTF-8; invalid sequences are replaced with
    /// U+FFFD rather than causing a failure.
    ///
    /// # Errors
    ///
    /// Returns an error only when reading `src` fails.
    fn extract(&self, src: &mut dyn Read, mime_type: &str) -> anyhow::Result<A2j> {
        let mut raw = Vec::new();
        src.read_to_end(&mut raw)?;

        let decoded = String::from_utf8_lossy(&raw).into_owned();
        let source = Source {
            path: "unknown".to_string(),
            size: raw.len() as u64,
            sha256: "unknown".to_string(),
        };

        Ok(A2j {
            source,
            content_type: mime_type.to_string(),
            encoding: Some("utf-8".to_string()),
            meta: serde_json::Value::Null,
            text: Some(decoded),
            structured: None,
            attachments: vec![],
            warnings: vec![],
            annotations: vec![],
        })
    }
}

83
src/extractors/zim.rs Normal file
View File

@ -0,0 +1,83 @@
use crate::{A2j, Extractor, Source};
use std::io::{Read, Write};
use std::process::Command;
use tempfile::{tempdir, NamedTempFile};
/// Extractor for ZIM files that shells out to the external `zimdump` tool.
pub struct ZimExtractor;

impl Extractor for ZimExtractor {
    /// Dumps the ZIM file read from `src` with `zimdump` and returns the
    /// contents of its main page as `text`.
    ///
    /// The input is written to a temporary file and dumped into a temporary
    /// directory. The main page is the first of `index.html`, `index`, or any
    /// `*.html` file found under the dump directory. When `zimdump` cannot be
    /// started, a result without text and with a warning is returned.
    ///
    /// # Errors
    ///
    /// Returns an error when reading `src` fails, temporary files cannot be
    /// created, `zimdump` exits unsuccessfully, or the main page cannot be read.
    fn extract(&self, src: &mut dyn Read, mime_type: &str) -> anyhow::Result<A2j> {
        let mut buffer = Vec::new();
        src.read_to_end(&mut buffer)?;
        let size = buffer.len() as u64;

        // Check if zimdump is installed
        if Command::new("zimdump").arg("--version").output().is_err() {
            return Ok(zim_result(
                size,
                mime_type,
                None,
                vec!["zimdump tool not found. Please install it to process ZIM files.".to_string()],
            ));
        }

        let mut temp_file = NamedTempFile::new()?;
        temp_file.write_all(&buffer)?;
        temp_file.flush()?;
        let temp_dir = tempdir()?;

        // Build the argument as an OsString so a non-UTF-8 temp path cannot panic.
        let mut dir_arg = std::ffi::OsString::from("--dir=");
        dir_arg.push(temp_dir.path());
        let output = Command::new("zimdump")
            .arg("dump")
            .arg(dir_arg)
            .arg(temp_file.path())
            .output()?;
        if !output.status.success() {
            return Err(anyhow::anyhow!(
                "zimdump failed with status: {}\n{}",
                output.status,
                String::from_utf8_lossy(&output.stderr)
            ));
        }

        let text = match find_main_page(temp_dir.path())? {
            Some(page) => Some(std::fs::read_to_string(page)?),
            None => None,
        };
        Ok(zim_result(size, mime_type, text, Vec::new()))
    }
}

/// Locates the page to use as the document text inside a `zimdump` output
/// directory: `index.html`, then `index`, then the first `*.html` file found.
fn find_main_page(dump_dir: &std::path::Path) -> anyhow::Result<Option<std::path::PathBuf>> {
    for name in ["index.html", "index"] {
        let candidate = dump_dir.join(name);
        // `is_file` skips a directory that happens to be called `index`.
        if candidate.is_file() {
            return Ok(Some(candidate));
        }
    }

    let dir = dump_dir.to_str().ok_or_else(|| {
        anyhow::anyhow!(
            "temporary directory path is not valid UTF-8: {}",
            dump_dir.display()
        )
    })?;
    // Escape the directory so glob metacharacters in the temp path are literal.
    let pattern = format!("{}/**/*.html", glob::Pattern::escape(dir));
    Ok(glob::glob(&pattern)?.find_map(Result::ok))
}

/// Builds the result record shared by the success and "tool missing" paths.
fn zim_result(size: u64, mime_type: &str, text: Option<String>, warnings: Vec<String>) -> A2j {
    A2j {
        source: Source {
            path: "unknown".to_string(),
            size,
            sha256: "unknown".to_string(),
        },
        content_type: mime_type.to_string(),
        encoding: None,
        meta: serde_json::Value::Null, // TODO: Extract metadata
        text,
        structured: None,
        attachments: vec![],
        warnings,
        annotations: vec![],
    }
}

209
src/main.rs Normal file
View File

@ -0,0 +1,209 @@
use serde::{Deserialize, Serialize};
use std::io::Read;
use clap::Parser;
use std::fs::File;
use std::io::BufReader;
use std::collections::HashMap;
use anyhow::Context;
use regex::Regex;
mod extractors;
use extractors::text::TextExtractor;
use extractors::html::HtmlExtractor;
use extractors::structured_text::StructuredTextExtractor;
use extractors::pdf::PdfExtractor;
use extractors::zim::ZimExtractor;
use extractors::office::OfficeExtractor;
use extractors::email::EmailExtractor;
use extractors::archive::ArchiveExtractor;
/// Represents a single annotation found in the text.
///
/// `apply_patterns` produces one value per regex match.
#[derive(Serialize, Debug)]
pub struct Annotation {
/// Category label, copied from the `type` of the pattern that matched.
pub r#type: String,
/// The matched substring.
pub value: String,
/// `id` of the pattern that produced this annotation.
pub pattern_id: String,
/// Start of the match within the text, as reported by the regex match
/// (a byte offset, not a character index).
pub start_offset: Option<usize>,
/// End of the match within the text, as reported by the regex match
/// (a byte offset, not a character index).
pub end_offset: Option<usize>,
}
/// Represents a single pattern configuration for annotation.
///
/// Deserialized from the JSON file given via `--annotations-config`, which
/// holds an array of these objects.
#[derive(Deserialize, Debug)]
pub struct PatternConfig {
/// Identifier copied into `Annotation::pattern_id` for every match.
pub id: String,
/// Regular expression source, compiled with the `regex` crate.
pub regex: String,
/// Category label copied into `Annotation::type` for every match.
pub r#type: String,
/// Whether matching is case-sensitive; treated as `true` when absent.
pub case_sensitive: Option<bool>,
/// Free-form description; not read by the code in this file.
pub description: Option<String>,
}
/// Common result object serialized to JSON.
///
/// Every extractor returns one of these; `main` prints it as the program output.
#[derive(Serialize)]
pub struct A2j {
/// Where the data came from (path, size, digest).
pub source: Source,
/// MIME type the input was handled as.
pub content_type: String,
/// Text encoding reported by the extractor, if it reports one.
pub encoding: Option<String>,
/// Format-specific metadata; `Null` when the extractor provides none.
pub meta: serde_json::Value,
/// Extracted plain text, if any. Annotation patterns are applied to this field.
pub text: Option<String>,
/// Structured representation of the content, if the extractor produces one.
pub structured: Option<serde_json::Value>,
/// Nested results for contained items.
pub attachments: Vec<A2j>,
/// Non-fatal problems encountered during extraction.
pub warnings: Vec<String>,
/// Regex annotations found in `text`; filled in by `main` when an
/// annotations config is supplied.
pub annotations: Vec<Annotation>,
}
/// Identifies the input a result was produced from.
#[derive(Serialize)]
pub struct Source {
/// Path of the input. `main` overwrites this with the CLI path argument
/// on the top-level result.
pub path: String,
/// Size of the input in bytes.
pub size: u64,
/// Intended to hold a SHA-256 digest.
/// NOTE(review): nothing in this file computes it; confirm where it is meant to be set.
pub sha256: String,
}
/// A converter from raw input bytes to an [`A2j`] result.
///
/// `main` stores implementations as boxed trait objects in a registry keyed by
/// MIME type.
pub trait Extractor: Send + Sync {
/// Reads the input from `src` and produces a result for it.
///
/// `mime_type` is the detected MIME type of the input; `main` passes the same
/// string it used to look the extractor up.
///
/// # Errors
///
/// Implementations return an error when the input cannot be read or parsed.
fn extract(&self, src: &mut dyn Read, mime_type: &str) -> anyhow::Result<A2j>;
}
// Command-line arguments, parsed with clap's derive API.
// NOTE: plain `//` comments are used for these notes on purpose; clap turns
// `///` doc comments into user-visible help text, so adding more of them
// would change the program's `--help` output.
#[derive(Parser)]
#[command(author, version, about, long_about = None)]
struct Cli {
/// The path to the file to process, or - for stdin
#[arg(value_name = "PATH")]
path: String,
/// Pretty print the output JSON
#[arg(short, long)]
pretty: bool,
/// Optional path to a JSON file containing regex patterns for annotation.
// The file is read by `load_patterns` and must contain an array of `PatternConfig`.
#[arg(long)]
annotations_config: Option<String>,
}
fn apply_patterns(text: &str, patterns: &[PatternConfig]) -> anyhow::Result<Vec<Annotation>> {
let mut annotations = Vec::new();
for config in patterns {
let regex = if config.case_sensitive.unwrap_or(true) {
Regex::new(&config.regex)?
} else {
Regex::new(&format!("(?i){}", config.regex))?
};
for mat in regex.find_iter(text) {
annotations.push(Annotation {
r#type: config.r#type.clone(),
value: mat.as_str().to_string(),
pattern_id: config.id.clone(),
start_offset: Some(mat.start()),
end_offset: Some(mat.end()),
});
}
}
Ok(annotations)
}
fn load_patterns(config_path: &str) -> anyhow::Result<Vec<PatternConfig>> {
let file = File::open(config_path).with_context(|| format!("Failed to open annotations config file: {}", config_path))?;
let reader = BufReader::new(file);
let patterns: Vec<PatternConfig> = serde_json::from_reader(reader).with_context(|| "Failed to deserialize annotations config")?;
Ok(patterns)
}
/// Program entry point: reads the input, detects its MIME type, runs the
/// matching extractor, optionally annotates the extracted text, and prints the
/// result as JSON on stdout.
///
/// # Errors
///
/// Returns an error when the input cannot be opened or read, the extractor
/// fails, the annotations config cannot be loaded or contains an invalid
/// pattern, or the result cannot be serialized.
fn main() -> anyhow::Result<()> {
    let cli = Cli::parse();

    let mut registry: HashMap<&'static str, Box<dyn Extractor>> = HashMap::new();
    registry.insert("text/plain", Box::new(TextExtractor));
    registry.insert("text/html", Box::new(HtmlExtractor));
    registry.insert("application/json", Box::new(StructuredTextExtractor));
    registry.insert("text/xml", Box::new(StructuredTextExtractor));
    registry.insert("application/pdf", Box::new(PdfExtractor));
    registry.insert("application/x-zim", Box::new(ZimExtractor));
    registry.insert(
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        Box::new(OfficeExtractor),
    );
    registry.insert(
        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
        Box::new(OfficeExtractor),
    );
    registry.insert(
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        Box::new(OfficeExtractor),
    );
    registry.insert("message/rfc822", Box::new(EmailExtractor));
    registry.insert("application/zip", Box::new(ArchiveExtractor));
    registry.insert("application/gzip", Box::new(ArchiveExtractor));
    registry.insert("application/x-tar", Box::new(ArchiveExtractor));

    let mut input: Box<dyn Read> = if cli.path == "-" {
        Box::new(BufReader::new(std::io::stdin()))
    } else {
        Box::new(BufReader::new(
            File::open(&cli.path).with_context(|| format!("Failed to open file: {}", cli.path))?,
        ))
    };
    let mut buffer = Vec::new();
    input
        .read_to_end(&mut buffer)
        .context("Failed to read input")?;

    // Generic detection: content sniffing first, then the file extension,
    // then text/plain as the last resort.
    let sniff_mime_type = || match infer::get(&buffer) {
        Some(kind) => kind.mime_type().to_string(),
        None => mime_guess::from_path(&cli.path)
            .first_or_text_plain()
            .to_string(),
    };

    let mime_type = if buffer.starts_with(b"ZIM\x04") {
        "application/x-zim".to_string()
    } else if buffer.starts_with(b"\x50\x4B\x03\x04") {
        // ZIP container: use the file extension to recognise Office documents
        // before falling back to generic detection.
        let path = cli.path.to_lowercase();
        if path.ends_with(".docx") {
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document".to_string()
        } else if path.ends_with(".pptx") {
            "application/vnd.openxmlformats-officedocument.presentationml.presentation".to_string()
        } else if path.ends_with(".xlsx") {
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet".to_string()
        } else {
            sniff_mime_type()
        }
    } else {
        sniff_mime_type()
    };

    let fallback_extractor: Box<dyn Extractor> = Box::new(FallbackExtractor);
    let extractor = registry
        .get(mime_type.as_str())
        .unwrap_or(&fallback_extractor);

    // Borrow the buffer for the extractor instead of cloning the whole input.
    let mut result =
        extractor.extract(&mut std::io::Cursor::new(buffer.as_slice()), &mime_type)?;
    result.source.path = cli.path.clone();
    result.source.size = buffer.len() as u64;

    if let Some(config_path) = cli.annotations_config.as_deref() {
        // The config is loaded even when there is no text, so a broken config
        // file is always reported.
        let patterns = load_patterns(config_path)?;
        if let Some(text) = &result.text {
            result.annotations = apply_patterns(text, &patterns)?;
        }
    }

    let output = if cli.pretty {
        serde_json::to_string_pretty(&result)?
    } else {
        serde_json::to_string(&result)?
    };
    println!("{}", output);
    Ok(())
}
/// Extractor used when no registered extractor matches the detected MIME type.
struct FallbackExtractor;

impl Extractor for FallbackExtractor {
    /// Consumes `src` to measure its size and returns a result that carries no
    /// content, only a warning that the file type is unsupported.
    ///
    /// # Errors
    ///
    /// Returns an error only when reading `src` fails.
    fn extract(&self, src: &mut dyn Read, mime_type: &str) -> anyhow::Result<A2j> {
        let mut raw = Vec::new();
        src.read_to_end(&mut raw)?;

        let source = Source {
            path: "unknown".to_string(),
            size: raw.len() as u64,
            sha256: "unknown".to_string(),
        };
        let warnings = vec!["No extractor found for this file type".to_string()];

        Ok(A2j {
            source,
            content_type: mime_type.to_string(),
            encoding: None,
            meta: serde_json::Value::Null,
            text: None,
            structured: None,
            attachments: vec![],
            warnings,
            annotations: vec![],
        })
    }
}