Files
noentropy/src/files/categorizer.rs
glitchySid ba0ea3f221 perf: eliminate unnecessary clones and improve API ergonomics
- PromptBuilder::new now takes &[String] instead of Vec<String>
- GeminiClient::new now takes &str, &[String] instead of owned values
- FileBatch::from_path now takes &Path instead of PathBuf
- categorize_files_offline now takes Vec<String> (ownership) instead of &[String]
- handle_offline_organization now takes FileBatch by value

These changes eliminate ~5-50 KB of unnecessary allocations for typical
file counts, reduce allocator pressure, and improve API clarity by properly
expressing ownership semantics.

No functional changes - all tests pass.
2026-01-08 23:42:10 +05:30

198 lines
5.7 KiB
Rust

use std::collections::HashMap;
use std::path::Path;
use std::sync::LazyLock;
use crate::models::{FileCategory, OrganizationPlan};
/// Lookup table from a lowercase file extension (without the leading dot) to
/// the name of the category that files with that extension are sorted into.
///
/// The map is built lazily, on first access, by flattening the per-category
/// extension lists declared inside the initializer.
static EXTENSION_MAP: LazyLock<HashMap<&'static str, &'static str>> = LazyLock::new(|| {
    // Each row pairs a category name with every extension assigned to it.
    // No extension appears in more than one row.
    const CATEGORIES: &[(&str, &[&str])] = &[
        (
            "Images",
            &[
                "jpg", "jpeg", "png", "gif", "bmp", "svg", "webp", "ico", "tiff", "tif", "raw",
                "heic", "heif",
            ],
        ),
        (
            "Documents",
            &[
                "pdf", "doc", "docx", "txt", "rtf", "odt", "xls", "xlsx", "ppt", "pptx", "csv",
                "md", "epub",
            ],
        ),
        (
            "Installers",
            &[
                "exe", "msi", "dmg", "deb", "rpm", "app", "appimage", "pkg", "snap",
            ],
        ),
        (
            "Music",
            &[
                "mp3", "wav", "flac", "aac", "ogg", "wma", "m4a", "opus", "aiff",
            ],
        ),
        (
            "Video",
            &[
                "mp4", "mkv", "avi", "mov", "wmv", "flv", "webm", "m4v", "mpeg", "mpg",
            ],
        ),
        (
            "Archives",
            &[
                "zip", "tar", "gz", "rar", "7z", "bz2", "xz", "tgz", "zst",
            ],
        ),
        (
            "Code",
            &[
                "rs", "py", "js", "ts", "java", "c", "cpp", "h", "hpp", "go", "rb", "php", "html",
                "css", "json", "yaml", "yml", "toml", "xml", "sh", "bash", "sql",
            ],
        ),
    ];

    CATEGORIES
        .iter()
        .flat_map(|&(category, extensions)| {
            extensions.iter().map(move |&extension| (extension, category))
        })
        .collect()
});
/// Looks up the category for `filename` based on its extension.
///
/// The extension is extracted with [`Path::extension`], lowercased, and then
/// matched against `EXTENSION_MAP`, so the lookup is case-insensitive.
///
/// Returns `Some(category)` when the extension is present in the table, and
/// `None` when the name has no extension or the extension is not recognised.
pub fn categorize_by_extension(filename: &str) -> Option<&'static str> {
    let extension = Path::new(filename).extension()?.to_str()?;
    // Keys in the table are lowercase, so normalise before the lookup.
    let key = extension.to_lowercase();
    EXTENSION_MAP.get(key.as_str()).copied()
}
/// Result of offline categorization, as returned by
/// [`categorize_files_offline`].
pub struct OfflineCategorizationResult {
/// Plan holding one entry per filename whose extension matched a known
/// category.
pub plan: OrganizationPlan,
/// Filenames that could not be categorized from their extension.
pub skipped: Vec<String>,
}
/// Sorts `filenames` into categories using only their file extensions.
///
/// Every name whose extension is recognised by [`categorize_by_extension`]
/// becomes a `FileCategory` in the returned plan, with an empty
/// `sub_category`. Every other name is collected into `skipped`. The input
/// order is preserved within each of the two groups.
///
/// Takes ownership of `filenames` so each name is moved into the result
/// rather than cloned.
pub fn categorize_files_offline(filenames: Vec<String>) -> OfflineCategorizationResult {
    let mut categorized = Vec::with_capacity(filenames.len());
    let mut uncategorized = Vec::new();

    for name in filenames {
        // Names without a recognised extension are set aside untouched.
        let Some(label) = categorize_by_extension(&name) else {
            uncategorized.push(name);
            continue;
        };

        categorized.push(FileCategory {
            filename: name,
            category: label.to_string(),
            sub_category: String::new(),
        });
    }

    OfflineCategorizationResult {
        plan: OrganizationPlan { files: categorized },
        skipped: uncategorized,
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// One representative extension from each category maps to that category.
    #[test]
    fn test_categorize_known_extensions() {
        assert_eq!(categorize_by_extension("photo.jpg"), Some("Images"));
        assert_eq!(categorize_by_extension("document.pdf"), Some("Documents"));
        assert_eq!(categorize_by_extension("setup.exe"), Some("Installers"));
        assert_eq!(categorize_by_extension("song.mp3"), Some("Music"));
        assert_eq!(categorize_by_extension("movie.mp4"), Some("Video"));
        assert_eq!(categorize_by_extension("archive.zip"), Some("Archives"));
        assert_eq!(categorize_by_extension("main.rs"), Some("Code"));
    }

    /// Extension matching ignores letter case.
    #[test]
    fn test_categorize_case_insensitive() {
        assert_eq!(categorize_by_extension("PHOTO.JPG"), Some("Images"));
        assert_eq!(categorize_by_extension("Photo.Png"), Some("Images"));
    }

    /// Extensions that are not in the table yield `None`.
    #[test]
    fn test_categorize_unknown_extension() {
        assert_eq!(categorize_by_extension("file.xyz"), None);
        assert_eq!(categorize_by_extension("file.unknown"), None);
    }

    /// Names without any extension yield `None`, including the empty name.
    #[test]
    fn test_categorize_no_extension() {
        assert_eq!(categorize_by_extension("README"), None);
        assert_eq!(categorize_by_extension("Makefile"), None);
        assert_eq!(categorize_by_extension(""), None);
    }

    /// Only the part after the final dot counts as the extension.
    #[test]
    fn test_categorize_uses_last_extension() {
        assert_eq!(categorize_by_extension("archive.tar.gz"), Some("Archives"));
        assert_eq!(categorize_by_extension("notes.backup.txt"), Some("Documents"));
        assert_eq!(categorize_by_extension("photo.jpg.xyz"), None);
    }

    /// A leading dot marks a hidden file, not an extension.
    #[test]
    fn test_categorize_hidden_files() {
        assert_eq!(categorize_by_extension(".gitignore"), None);
        assert_eq!(categorize_by_extension(".json"), None);
        assert_eq!(categorize_by_extension(".config.json"), Some("Code"));
    }

    /// Dots in directory components do not affect the result.
    #[test]
    fn test_categorize_with_directory_components() {
        assert_eq!(categorize_by_extension("downloads/photo.jpg"), Some("Images"));
        assert_eq!(categorize_by_extension("some.dir/README"), None);
    }

    /// Known files land in the plan with the right category, unknown files are
    /// skipped, and input order is preserved in both groups.
    #[test]
    fn test_categorize_files_offline() {
        let filenames = vec![
            "photo.jpg".to_string(),
            "doc.pdf".to_string(),
            "unknown".to_string(),
            "file.xyz".to_string(),
        ];
        let result = categorize_files_offline(filenames);

        assert_eq!(result.plan.files.len(), 2);
        assert_eq!(result.skipped.len(), 2);

        let categorized: Vec<(&str, &str)> = result
            .plan
            .files
            .iter()
            .map(|file| (file.filename.as_str(), file.category.as_str()))
            .collect();
        assert_eq!(
            categorized,
            vec![("photo.jpg", "Images"), ("doc.pdf", "Documents")]
        );
        // Offline categorization never assigns a sub-category.
        assert!(result
            .plan
            .files
            .iter()
            .all(|file| file.sub_category.is_empty()));

        assert_eq!(result.skipped, vec!["unknown", "file.xyz"]);
    }

    /// An empty input produces an empty plan and nothing skipped.
    #[test]
    fn test_categorize_files_offline_empty_input() {
        let result = categorize_files_offline(Vec::new());
        assert!(result.plan.files.is_empty());
        assert!(result.skipped.is_empty());
    }
}