feat: Initial commit of anything2json
This commit is contained in:
parent
2ba1121c01
commit
4c71bec26a
|
|
@ -0,0 +1,2 @@
|
|||
Cargo.lock
|
||||
target
|
||||
|
|
@ -0,0 +1,31 @@
|
|||
[package]
|
||||
name = "a2j"
|
||||
version = "0.1.0"
|
||||
edition = "2024"
|
||||
|
||||
[dependencies]
|
||||
regex = "1.10"
|
||||
glob = "0.3"
|
||||
tempfile = "3.10"
|
||||
mime_guess = "2.0"
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
serde_json = "1.0"
|
||||
anyhow = "1.0"
|
||||
thiserror = "1.0"
|
||||
tracing = "0.1"
|
||||
infer = "0.15"
|
||||
chardetng = "0.1"
|
||||
encoding_rs = "0.8"
|
||||
content_inspector = "0.2"
|
||||
scraper = "0.19"
|
||||
quick-xml = { version = "0.31", features = ["serialize"] }
|
||||
pdf-extract = "0.7"
|
||||
zip = "0.6"
|
||||
calamine = "0.22"
|
||||
kamadak-exif = "0.5"
|
||||
symphonia = "0.5"
|
||||
mailparse = "0.14"
|
||||
flate2 = { version = "1.0", features = ["miniz_oxide"], default-features = false }
|
||||
tar = "0.4"
|
||||
goblin = "0.7"
|
||||
clap = { version = "4.4", features = ["derive"] }
|
||||
|
|
@ -0,0 +1,89 @@
|
|||
|
||||
use crate::{A2j, Extractor, Source};
|
||||
use std::io::Read;
|
||||
use zip::ZipArchive;
|
||||
use tar::Archive;
|
||||
use flate2::read::GzDecoder;
|
||||
|
||||
pub struct ArchiveExtractor;
|
||||
|
||||
impl Extractor for ArchiveExtractor {
|
||||
fn extract(&self, src: &mut dyn Read, mime_type: &str) -> anyhow::Result<A2j> {
|
||||
let mut buffer = Vec::new();
|
||||
src.read_to_end(&mut buffer)?;
|
||||
|
||||
let mut attachments = Vec::new();
|
||||
let mut warnings = Vec::new();
|
||||
|
||||
match mime_type {
|
||||
"application/zip" => {
|
||||
let cursor = std::io::Cursor::new(buffer.clone());
|
||||
let mut archive = ZipArchive::new(cursor)?;
|
||||
|
||||
for i in 0..archive.len() {
|
||||
let file = archive.by_index(i)?;
|
||||
if file.is_file() {
|
||||
attachments.push(A2j {
|
||||
source: Source {
|
||||
path: file.name().to_string(),
|
||||
size: file.size(),
|
||||
sha256: "unknown".to_string(),
|
||||
},
|
||||
content_type: "application/octet-stream".to_string(), // Placeholder
|
||||
encoding: None,
|
||||
meta: serde_json::Value::Null,
|
||||
text: None,
|
||||
structured: None,
|
||||
attachments: vec![],
|
||||
warnings: vec!["Recursive extraction not fully implemented".to_string()],
|
||||
annotations: vec![],
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
"application/gzip" | "application/x-tar" => {
|
||||
let cursor = std::io::Cursor::new(buffer.clone());
|
||||
let decoder = GzDecoder::new(cursor);
|
||||
let mut archive = Archive::new(decoder);
|
||||
|
||||
for file in archive.entries()? {
|
||||
let file = file?;
|
||||
if file.header().entry_type().is_file() {
|
||||
attachments.push(A2j {
|
||||
source: Source {
|
||||
path: file.path()?.to_str().unwrap_or("unknown").to_string(),
|
||||
size: file.size(),
|
||||
sha256: "unknown".to_string(),
|
||||
},
|
||||
content_type: "application/octet-stream".to_string(), // Placeholder
|
||||
encoding: None,
|
||||
meta: serde_json::Value::Null,
|
||||
text: None,
|
||||
structured: None,
|
||||
attachments: vec![],
|
||||
warnings: vec!["Recursive extraction not fully implemented".to_string()],
|
||||
annotations: vec![],
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
_ => warnings.push(format!("Unsupported archive type: {}", mime_type)),
|
||||
}
|
||||
|
||||
Ok(A2j {
|
||||
source: Source {
|
||||
path: "unknown".to_string(),
|
||||
size: buffer.len() as u64,
|
||||
sha256: "unknown".to_string(),
|
||||
},
|
||||
content_type: mime_type.to_string(),
|
||||
encoding: None,
|
||||
meta: serde_json::Value::Null,
|
||||
text: None,
|
||||
structured: None,
|
||||
attachments,
|
||||
warnings,
|
||||
annotations: vec![],
|
||||
})
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,39 @@
|
|||
|
||||
use crate::{A2j, Extractor, Source};
|
||||
use std::io::Read;
|
||||
use mailparse::parse_mail;
|
||||
use serde_json::json;
|
||||
|
||||
pub struct EmailExtractor;
|
||||
|
||||
impl Extractor for EmailExtractor {
|
||||
fn extract(&self, src: &mut dyn Read, mime_type: &str) -> anyhow::Result<A2j> {
|
||||
let mut buffer = Vec::new();
|
||||
src.read_to_end(&mut buffer)?;
|
||||
|
||||
let mail = parse_mail(&buffer)?;
|
||||
|
||||
let mut headers = json!({});
|
||||
for header in &mail.headers {
|
||||
headers[&header.get_key().to_lowercase()] = json!(header.get_value());
|
||||
}
|
||||
|
||||
let text_body = mail.get_body().ok();
|
||||
|
||||
Ok(A2j {
|
||||
source: Source {
|
||||
path: "unknown".to_string(),
|
||||
size: buffer.len() as u64,
|
||||
sha256: "unknown".to_string(),
|
||||
},
|
||||
content_type: mime_type.to_string(),
|
||||
encoding: None, // mailparse handles encoding internally
|
||||
meta: headers,
|
||||
text: text_body,
|
||||
structured: None,
|
||||
attachments: vec![], // TODO: Handle attachments
|
||||
warnings: vec![],
|
||||
annotations: vec![],
|
||||
})
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,32 @@
|
|||
|
||||
use crate::{A2j, Extractor, Source};
|
||||
use scraper::{Html, Selector};
|
||||
use std::io::Read;
|
||||
|
||||
pub struct HtmlExtractor;
|
||||
|
||||
impl Extractor for HtmlExtractor {
|
||||
fn extract(&self, src: &mut dyn Read, mime_type: &str) -> anyhow::Result<A2j> {
|
||||
let mut buffer = String::new();
|
||||
src.read_to_string(&mut buffer)?;
|
||||
let document = Html::parse_document(&buffer);
|
||||
let selector = Selector::parse("body").unwrap();
|
||||
let text = document.select(&selector).next().map(|e| e.text().collect::<String>());
|
||||
|
||||
Ok(A2j {
|
||||
source: Source {
|
||||
path: "unknown".to_string(),
|
||||
size: buffer.len() as u64,
|
||||
sha256: "unknown".to_string(),
|
||||
},
|
||||
content_type: mime_type.to_string(),
|
||||
encoding: Some("utf-8".to_string()),
|
||||
meta: serde_json::Value::Null, // TODO: Extract metadata
|
||||
text,
|
||||
structured: None,
|
||||
attachments: vec![],
|
||||
warnings: vec![],
|
||||
annotations: vec![],
|
||||
})
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,8 @@
|
|||
pub mod text;
|
||||
pub mod html;
|
||||
pub mod structured_text;
|
||||
pub mod pdf;
|
||||
pub mod zim;
|
||||
pub mod office;
|
||||
pub mod email;
|
||||
pub mod archive;
|
||||
|
|
@ -0,0 +1,103 @@
|
|||
use crate::{A2j, Extractor, Source};
|
||||
use std::io::Read;
|
||||
use zip::ZipArchive;
|
||||
use calamine::{Reader, Xlsx};
|
||||
use quick_xml::events::Event;
|
||||
use quick_xml::Reader as XmlReader;
|
||||
|
||||
pub struct OfficeExtractor;
|
||||
|
||||
impl Extractor for OfficeExtractor {
|
||||
fn extract(&self, src: &mut dyn Read, mime_type: &str) -> anyhow::Result<A2j> {
|
||||
let mut buffer = Vec::new();
|
||||
src.read_to_end(&mut buffer)?;
|
||||
let cursor = std::io::Cursor::new(buffer.clone());
|
||||
|
||||
let text = match mime_type {
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document" => {
|
||||
let mut archive = ZipArchive::new(cursor)?;
|
||||
let mut doc = archive.by_name("word/document.xml")?;
|
||||
let mut xml = String::new();
|
||||
doc.read_to_string(&mut xml)?;
|
||||
|
||||
let mut reader = XmlReader::from_str(&xml);
|
||||
reader.trim_text(true);
|
||||
|
||||
let mut txt = String::new();
|
||||
loop {
|
||||
match reader.read_event() {
|
||||
Ok(Event::Start(e)) => {
|
||||
if e.name().as_ref() == b"w:t" {
|
||||
txt.push_str(&reader.read_text(e.name())?);
|
||||
}
|
||||
}
|
||||
Ok(Event::Eof) => break,
|
||||
Err(e) => return Err(e.into()),
|
||||
_ => (),
|
||||
}
|
||||
}
|
||||
Some(txt)
|
||||
}
|
||||
"application/vnd.openxmlformats-officedocument.presentationml.presentation" => {
|
||||
let mut archive = ZipArchive::new(cursor)?;
|
||||
let mut text = String::new();
|
||||
for i in 0..archive.len() {
|
||||
let mut file = archive.by_index(i)?;
|
||||
if file.name().starts_with("ppt/slides/slide") {
|
||||
let mut slide_text = String::new();
|
||||
file.read_to_string(&mut slide_text)?;
|
||||
|
||||
let mut reader = XmlReader::from_str(&slide_text);
|
||||
reader.trim_text(true);
|
||||
|
||||
loop {
|
||||
match reader.read_event() {
|
||||
Ok(Event::Start(e)) => {
|
||||
if e.name().as_ref() == b"a:t" {
|
||||
text.push_str(&reader.read_text(e.name())?);
|
||||
text.push(' ');
|
||||
}
|
||||
}
|
||||
Ok(Event::Eof) => break,
|
||||
Err(e) => return Err(e.into()),
|
||||
_ => (),
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Some(text)
|
||||
}
|
||||
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" => {
|
||||
let mut xlsx: Xlsx<_> = calamine::open_workbook_from_rs(cursor)?;
|
||||
let mut text = String::new();
|
||||
if let Some(Ok(range)) = xlsx.worksheet_range("Sheet1") {
|
||||
for row in range.rows() {
|
||||
for cell in row {
|
||||
text.push_str(&cell.to_string());
|
||||
text.push('\t');
|
||||
}
|
||||
text.push('\n');
|
||||
}
|
||||
}
|
||||
Some(text)
|
||||
}
|
||||
_ => None,
|
||||
};
|
||||
|
||||
Ok(A2j {
|
||||
source: Source {
|
||||
path: "unknown".to_string(),
|
||||
size: buffer.len() as u64,
|
||||
sha256: "unknown".to_string(),
|
||||
},
|
||||
content_type: mime_type.to_string(),
|
||||
encoding: None,
|
||||
meta: serde_json::Value::Null, // TODO: Extract metadata
|
||||
text,
|
||||
structured: None,
|
||||
attachments: vec![],
|
||||
warnings: vec![],
|
||||
annotations: vec![],
|
||||
})
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,29 @@
|
|||
|
||||
use crate::{A2j, Extractor, Source};
|
||||
use std::io::Read;
|
||||
|
||||
pub struct PdfExtractor;
|
||||
|
||||
impl Extractor for PdfExtractor {
|
||||
fn extract(&self, src: &mut dyn Read, mime_type: &str) -> anyhow::Result<A2j> {
|
||||
let mut buffer = Vec::new();
|
||||
src.read_to_end(&mut buffer)?;
|
||||
let text = pdf_extract::extract_text_from_mem(&buffer)?;
|
||||
|
||||
Ok(A2j {
|
||||
source: Source {
|
||||
path: "unknown".to_string(),
|
||||
size: buffer.len() as u64,
|
||||
sha256: "unknown".to_string(),
|
||||
},
|
||||
content_type: mime_type.to_string(),
|
||||
encoding: None,
|
||||
meta: serde_json::Value::Null, // TODO: Extract metadata
|
||||
text: Some(text),
|
||||
structured: None,
|
||||
attachments: vec![],
|
||||
warnings: vec![],
|
||||
annotations: vec![],
|
||||
})
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,37 @@
|
|||
|
||||
use crate::{A2j, Extractor, Source};
|
||||
use serde_json::Value;
|
||||
use std::io::Read;
|
||||
|
||||
pub struct StructuredTextExtractor;
|
||||
|
||||
impl Extractor for StructuredTextExtractor {
|
||||
fn extract(&self, src: &mut dyn Read, mime_type: &str) -> anyhow::Result<A2j> {
|
||||
let mut buffer = String::new();
|
||||
src.read_to_string(&mut buffer)?;
|
||||
|
||||
let structured = if mime_type == "application/json" {
|
||||
serde_json::from_str(&buffer)?
|
||||
} else {
|
||||
// For XML, we'll just store the raw XML as a string in the structured field for now.
|
||||
// A more advanced implementation could convert it to a JSON-like structure.
|
||||
Value::String(buffer.clone())
|
||||
};
|
||||
|
||||
Ok(A2j {
|
||||
source: Source {
|
||||
path: "unknown".to_string(),
|
||||
size: buffer.len() as u64,
|
||||
sha256: "unknown".to_string(),
|
||||
},
|
||||
content_type: mime_type.to_string(),
|
||||
encoding: Some("utf-8".to_string()),
|
||||
meta: serde_json::Value::Null,
|
||||
text: None,
|
||||
structured: Some(structured),
|
||||
attachments: vec![],
|
||||
warnings: vec![],
|
||||
annotations: vec![],
|
||||
})
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,29 @@
|
|||
|
||||
use crate::{A2j, Extractor, Source};
|
||||
use std::io::Read;
|
||||
|
||||
pub struct TextExtractor;
|
||||
|
||||
impl Extractor for TextExtractor {
|
||||
fn extract(&self, src: &mut dyn Read, mime_type: &str) -> anyhow::Result<A2j> {
|
||||
let mut buffer = Vec::new();
|
||||
src.read_to_end(&mut buffer)?;
|
||||
let text = String::from_utf8_lossy(&buffer).to_string();
|
||||
|
||||
Ok(A2j {
|
||||
source: Source {
|
||||
path: "unknown".to_string(),
|
||||
size: buffer.len() as u64,
|
||||
sha256: "unknown".to_string(),
|
||||
},
|
||||
content_type: mime_type.to_string(),
|
||||
encoding: Some("utf-8".to_string()),
|
||||
meta: serde_json::Value::Null,
|
||||
text: Some(text),
|
||||
structured: None,
|
||||
attachments: vec![],
|
||||
warnings: vec![],
|
||||
annotations: vec![],
|
||||
})
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,83 @@
|
|||
|
||||
use crate::{A2j, Extractor, Source};
|
||||
use std::io::{Read, Write};
|
||||
use std::process::Command;
|
||||
use tempfile::{tempdir, NamedTempFile};
|
||||
|
||||
pub struct ZimExtractor;
|
||||
|
||||
impl Extractor for ZimExtractor {
|
||||
fn extract(&self, src: &mut dyn Read, mime_type: &str) -> anyhow::Result<A2j> {
|
||||
let mut buffer = Vec::new();
|
||||
src.read_to_end(&mut buffer)?;
|
||||
|
||||
// Check if zimdump is installed
|
||||
if Command::new("zimdump").arg("--version").output().is_err() {
|
||||
return Ok(A2j {
|
||||
source: Source {
|
||||
path: "unknown".to_string(),
|
||||
size: buffer.len() as u64,
|
||||
sha256: "unknown".to_string(),
|
||||
},
|
||||
content_type: mime_type.to_string(),
|
||||
encoding: None,
|
||||
meta: serde_json::Value::Null,
|
||||
text: None,
|
||||
structured: None,
|
||||
attachments: vec![],
|
||||
warnings: vec!["zimdump tool not found. Please install it to process ZIM files.".to_string()],
|
||||
annotations: vec![],
|
||||
});
|
||||
}
|
||||
|
||||
let mut temp_file = NamedTempFile::new()?;
|
||||
temp_file.write_all(&buffer)?;
|
||||
|
||||
let temp_dir = tempdir()?;
|
||||
|
||||
let output = Command::new("zimdump")
|
||||
.arg("dump")
|
||||
.arg(format!("--dir={}", temp_dir.path().to_str().unwrap()))
|
||||
.arg(temp_file.path())
|
||||
.output()?;
|
||||
|
||||
if !output.status.success() {
|
||||
return Err(anyhow::anyhow!("zimdump failed with status: {}\n{}", output.status, String::from_utf8_lossy(&output.stderr)));
|
||||
}
|
||||
|
||||
|
||||
let main_page_path = temp_dir.path().join("index.html");
|
||||
let text = if main_page_path.exists() {
|
||||
Some(std::fs::read_to_string(main_page_path)?)
|
||||
} else {
|
||||
let main_page_path = temp_dir.path().join("index");
|
||||
if main_page_path.exists() {
|
||||
Some(std::fs::read_to_string(main_page_path)?)
|
||||
} else {
|
||||
// If index.html doesn't exist, try to find the first html file
|
||||
let mut html_files = glob::glob(temp_dir.path().join("**/*.html").to_str().unwrap())?;
|
||||
if let Some(Ok(path)) = html_files.next() {
|
||||
Some(std::fs::read_to_string(path)?)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
Ok(A2j {
|
||||
source: Source {
|
||||
path: "unknown".to_string(),
|
||||
size: buffer.len() as u64,
|
||||
sha256: "unknown".to_string(),
|
||||
},
|
||||
content_type: mime_type.to_string(),
|
||||
encoding: None,
|
||||
meta: serde_json::Value::Null, // TODO: Extract metadata
|
||||
text,
|
||||
structured: None,
|
||||
attachments: vec![],
|
||||
warnings: vec![],
|
||||
annotations: vec![],
|
||||
})
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,209 @@
|
|||
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::io::Read;
|
||||
use clap::Parser;
|
||||
use std::fs::File;
|
||||
use std::io::BufReader;
|
||||
use std::collections::HashMap;
|
||||
use anyhow::Context;
|
||||
use regex::Regex;
|
||||
|
||||
mod extractors;
|
||||
use extractors::text::TextExtractor;
|
||||
use extractors::html::HtmlExtractor;
|
||||
use extractors::structured_text::StructuredTextExtractor;
|
||||
use extractors::pdf::PdfExtractor;
|
||||
use extractors::zim::ZimExtractor;
|
||||
use extractors::office::OfficeExtractor;
|
||||
use extractors::email::EmailExtractor;
|
||||
use extractors::archive::ArchiveExtractor;
|
||||
|
||||
/// Represents a single annotation found in the text.
|
||||
#[derive(Serialize, Debug)]
|
||||
pub struct Annotation {
|
||||
pub r#type: String,
|
||||
pub value: String,
|
||||
pub pattern_id: String,
|
||||
pub start_offset: Option<usize>,
|
||||
pub end_offset: Option<usize>,
|
||||
}
|
||||
|
||||
/// Represents a single pattern configuration for annotation.
|
||||
#[derive(Deserialize, Debug)]
|
||||
pub struct PatternConfig {
|
||||
pub id: String,
|
||||
pub regex: String,
|
||||
pub r#type: String,
|
||||
pub case_sensitive: Option<bool>,
|
||||
pub description: Option<String>,
|
||||
}
|
||||
|
||||
/// Common result object serialized to JSON.
|
||||
#[derive(Serialize)]
|
||||
pub struct A2j {
|
||||
pub source: Source,
|
||||
pub content_type: String,
|
||||
pub encoding: Option<String>,
|
||||
pub meta: serde_json::Value,
|
||||
pub text: Option<String>,
|
||||
pub structured: Option<serde_json::Value>,
|
||||
pub attachments: Vec<A2j>,
|
||||
pub warnings: Vec<String>,
|
||||
pub annotations: Vec<Annotation>,
|
||||
}
|
||||
|
||||
#[derive(Serialize)]
|
||||
pub struct Source {
|
||||
pub path: String,
|
||||
pub size: u64,
|
||||
pub sha256: String,
|
||||
}
|
||||
|
||||
pub trait Extractor: Send + Sync {
|
||||
fn extract(&self, src: &mut dyn Read, mime_type: &str) -> anyhow::Result<A2j>;
|
||||
}
|
||||
|
||||
#[derive(Parser)]
|
||||
#[command(author, version, about, long_about = None)]
|
||||
struct Cli {
|
||||
/// The path to the file to process, or - for stdin
|
||||
#[arg(value_name = "PATH")]
|
||||
path: String,
|
||||
|
||||
/// Pretty print the output JSON
|
||||
#[arg(short, long)]
|
||||
pretty: bool,
|
||||
|
||||
/// Optional path to a JSON file containing regex patterns for annotation.
|
||||
#[arg(long)]
|
||||
annotations_config: Option<String>,
|
||||
}
|
||||
|
||||
fn apply_patterns(text: &str, patterns: &[PatternConfig]) -> anyhow::Result<Vec<Annotation>> {
|
||||
let mut annotations = Vec::new();
|
||||
for config in patterns {
|
||||
let regex = if config.case_sensitive.unwrap_or(true) {
|
||||
Regex::new(&config.regex)?
|
||||
} else {
|
||||
Regex::new(&format!("(?i){}", config.regex))?
|
||||
};
|
||||
|
||||
for mat in regex.find_iter(text) {
|
||||
annotations.push(Annotation {
|
||||
r#type: config.r#type.clone(),
|
||||
value: mat.as_str().to_string(),
|
||||
pattern_id: config.id.clone(),
|
||||
start_offset: Some(mat.start()),
|
||||
end_offset: Some(mat.end()),
|
||||
});
|
||||
}
|
||||
}
|
||||
Ok(annotations)
|
||||
}
|
||||
|
||||
fn load_patterns(config_path: &str) -> anyhow::Result<Vec<PatternConfig>> {
|
||||
let file = File::open(config_path).with_context(|| format!("Failed to open annotations config file: {}", config_path))?;
|
||||
let reader = BufReader::new(file);
|
||||
let patterns: Vec<PatternConfig> = serde_json::from_reader(reader).with_context(|| "Failed to deserialize annotations config")?;
|
||||
Ok(patterns)
|
||||
}
|
||||
|
||||
fn main() -> anyhow::Result<()> {
|
||||
let cli = Cli::parse();
|
||||
|
||||
let mut registry: HashMap<&'static str, Box<dyn Extractor>> = HashMap::new();
|
||||
registry.insert("text/plain", Box::new(TextExtractor));
|
||||
registry.insert("text/html", Box::new(HtmlExtractor));
|
||||
registry.insert("application/json", Box::new(StructuredTextExtractor));
|
||||
registry.insert("text/xml", Box::new(StructuredTextExtractor));
|
||||
registry.insert("application/pdf", Box::new(PdfExtractor));
|
||||
registry.insert("application/x-zim", Box::new(ZimExtractor));
|
||||
registry.insert("application/vnd.openxmlformats-officedocument.wordprocessingml.document", Box::new(OfficeExtractor));
|
||||
registry.insert("application/vnd.openxmlformats-officedocument.presentationml.presentation", Box::new(OfficeExtractor));
|
||||
registry.insert("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", Box::new(OfficeExtractor));
|
||||
registry.insert("message/rfc822", Box::new(EmailExtractor));
|
||||
registry.insert("application/zip", Box::new(ArchiveExtractor));
|
||||
registry.insert("application/gzip", Box::new(ArchiveExtractor));
|
||||
registry.insert("application/x-tar", Box::new(ArchiveExtractor));
|
||||
|
||||
let mut input: Box<dyn Read> = if cli.path == "-" {
|
||||
Box::new(BufReader::new(std::io::stdin()))
|
||||
} else {
|
||||
Box::new(BufReader::new(File::open(&cli.path).with_context(|| format!("Failed to open file: {}", cli.path))?))
|
||||
};
|
||||
|
||||
let mut buffer = Vec::new();
|
||||
input.read_to_end(&mut buffer)?;
|
||||
|
||||
let mime_type = if buffer.starts_with(b"ZIM\x04") {
|
||||
"application/x-zim".to_string()
|
||||
} else if buffer.starts_with(b"\x50\x4B\x03\x04") {
|
||||
let path = cli.path.to_lowercase();
|
||||
if path.ends_with(".docx") {
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document".to_string()
|
||||
} else if path.ends_with(".pptx") {
|
||||
"application/vnd.openxmlformats-officedocument.presentationml.presentation".to_string()
|
||||
} else if path.ends_with(".xlsx") {
|
||||
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet".to_string()
|
||||
} else {
|
||||
match infer::get(&buffer) {
|
||||
Some(kind) => kind.mime_type().to_string(),
|
||||
None => mime_guess::from_path(&cli.path).first_or_text_plain().to_string(),
|
||||
}
|
||||
}
|
||||
} else {
|
||||
match infer::get(&buffer) {
|
||||
Some(kind) => kind.mime_type().to_string(),
|
||||
None => mime_guess::from_path(&cli.path).first_or_text_plain().to_string(),
|
||||
}
|
||||
};
|
||||
|
||||
let fallback_extractor: Box<dyn Extractor> = Box::new(FallbackExtractor);
|
||||
let extractor = registry.get(mime_type.as_str()).unwrap_or(&fallback_extractor);
|
||||
|
||||
let mut result = extractor.extract(&mut std::io::Cursor::new(buffer.clone()), &mime_type)?;
|
||||
result.source.path = cli.path.clone();
|
||||
result.source.size = buffer.len() as u64;
|
||||
|
||||
if let Some(config_path) = cli.annotations_config {
|
||||
let patterns = load_patterns(&config_path)?;
|
||||
if let Some(text) = &result.text {
|
||||
let annotations = apply_patterns(text, &patterns)?;
|
||||
result.annotations = annotations;
|
||||
}
|
||||
}
|
||||
|
||||
let output = if cli.pretty {
|
||||
serde_json::to_string_pretty(&result)?
|
||||
} else {
|
||||
serde_json::to_string(&result)?
|
||||
};
|
||||
|
||||
println!("{}", output);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
struct FallbackExtractor;
|
||||
impl Extractor for FallbackExtractor {
|
||||
fn extract(&self, src: &mut dyn Read, mime_type: &str) -> anyhow::Result<A2j> {
|
||||
let mut buffer = Vec::new();
|
||||
src.read_to_end(&mut buffer)?;
|
||||
|
||||
Ok(A2j {
|
||||
source: Source {
|
||||
path: "unknown".to_string(),
|
||||
size: buffer.len() as u64,
|
||||
sha256: "unknown".to_string(),
|
||||
},
|
||||
content_type: mime_type.to_string(),
|
||||
encoding: None,
|
||||
meta: serde_json::Value::Null,
|
||||
text: None,
|
||||
structured: None,
|
||||
attachments: vec![],
|
||||
warnings: vec!["No extractor found for this file type".to_string()],
|
||||
annotations: vec![],
|
||||
})
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue