From 19d0ea408cc955a6a04dcca3dbc7733fcdaffbef Mon Sep 17 00:00:00 2001 From: Matthias Ahouansou Date: Sun, 23 Mar 2025 17:23:57 +0000 Subject: [PATCH] feat(media): deep hashed directory structure --- docs/configuration.md | 22 +++++++++- src/config/mod.rs | 84 +++++++++++++++++++++++++++++++++++--- src/main.rs | 54 ++++++++++++++++-------- src/service/globals/mod.rs | 30 +++++++++++--- src/service/media/mod.rs | 18 ++++++-- 5 files changed, 173 insertions(+), 35 deletions(-) diff --git a/docs/configuration.md b/docs/configuration.md index ffbfa512..3323fb64 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -64,7 +64,7 @@ The `global` section contains the following fields: ### Media The `media` table is used to configure how media is stored and where. Currently, there is only one available backend, that being `filesystem`. The backend can be set using the `backend` field. Example: -``` +```toml [global.media] backend = "filesystem" # the default backend ``` @@ -73,12 +73,30 @@ backend = "filesystem" # the default backend The filesystem backend has the following fields: - `path`: The base directory where all the media files will be stored (defaults to `${database_path}/media`) +- `directory_structure`: This is a table, used to configure how files are to be distributed within + the media directory. It has the following fields: + - `depth`: The number of sub-directories that should be created for files (default: `2`) + - `length`: How long the name of these sub-directories should be (default: `2`) + For example, a file may regularly have the name `98ea6e4f216f2fb4b69fff9b3a44842c38686ca685f3f55dc48c5d3fb1107be4` + (The SHA256 digest of the file's content). If `depth` and `length` were both set to `2`, this file would be stored + at `${path}/98/ea/6e4f216f2fb4b69fff9b3a44842c38686ca685f3f55dc48c5d3fb1107be4`. 
If you want to instead have all + media files in the base directory with no sub-directories, just set `directory_structure` to be empty, as follows: + ```toml + [global.media] + backend = "filesystem" + + [global.media.directory_structure] + ``` ##### Example: -``` +```toml [global.media] backend = "filesystem" path = "/srv/matrix-media" + +[global.media.directory_structure] +depth = 4 +length = 2 ``` ### TLS diff --git a/src/config/mod.rs b/src/config/mod.rs index 370dcfec..46dcf7e0 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -2,6 +2,7 @@ use std::{ collections::BTreeMap, fmt, net::{IpAddr, Ipv4Addr}, + num::NonZeroU8, path::PathBuf, }; @@ -10,10 +11,13 @@ use serde::{de::IgnoredAny, Deserialize}; use tracing::warn; use url::Url; -mod proxy; +use crate::Error; +mod proxy; use self::proxy::ProxyConfig; +const SHA256_HEX_LENGTH: u8 = 64; + #[derive(Deserialize)] pub struct IncompleteConfig { #[serde(default = "default_address")] @@ -218,7 +222,10 @@ impl From for Config { }; let media = match media { - IncompleteMediaConfig::FileSystem { path } => MediaConfig::FileSystem { + IncompleteMediaConfig::FileSystem { + path, + directory_structure, + } => MediaConfig::FileSystem { path: path.unwrap_or_else(|| { // We do this as we don't know if the path has a trailing slash, or even if the // path separator is a forward or backward slash @@ -229,6 +236,7 @@ impl From for Config { .into_string() .expect("Both inputs are valid UTF-8") }), + directory_structure, }, }; @@ -309,21 +317,85 @@ pub struct WellKnownConfig { pub server: OwnedServerName, } -#[derive(Clone, Debug, Deserialize)] +#[derive(Deserialize)] #[serde(tag = "backend", rename_all = "lowercase")] pub enum IncompleteMediaConfig { - FileSystem { path: Option }, + FileSystem { + path: Option, + #[serde(default)] + directory_structure: DirectoryStructure, + }, } impl Default for IncompleteMediaConfig { fn default() -> Self { - Self::FileSystem { path: None } + Self::FileSystem { + path: None, + 
directory_structure: DirectoryStructure::default(), + } } } #[derive(Debug, Clone)] pub enum MediaConfig { - FileSystem { path: String }, + FileSystem { + path: String, + directory_structure: DirectoryStructure, + }, +} + +#[derive(Debug, Clone, Deserialize)] +// See https://github.com/serde-rs/serde/issues/642#issuecomment-525432907 +#[serde(try_from = "ShadowDirectoryStructure", untagged)] +pub enum DirectoryStructure { + // We do this enum instead of Option, so that we can have the structure be + // deep by default, while still providing a way for it to be flat (by creating an empty table) + // + // e.g.: + // ```toml + // [global.media.directory_structure] + // ``` + Flat, + Deep { length: NonZeroU8, depth: NonZeroU8 }, +} + +impl Default for DirectoryStructure { + fn default() -> Self { + Self::Deep { + length: NonZeroU8::new(2).expect("2 is not 0"), + depth: NonZeroU8::new(2).expect("2 is not 0"), + } + } +} + +#[derive(Deserialize)] +#[serde(untagged)] +enum ShadowDirectoryStructure { + Flat {}, + Deep { length: NonZeroU8, depth: NonZeroU8 }, +} + +impl TryFrom for DirectoryStructure { + type Error = Error; + + fn try_from(value: ShadowDirectoryStructure) -> Result { + match value { + ShadowDirectoryStructure::Flat {} => Ok(Self::Flat), + ShadowDirectoryStructure::Deep { length, depth } => { + if length + .get() + .checked_mul(depth.get()) + .map(|product| product < SHA256_HEX_LENGTH) + // If an overflow occurs, it definitely isn't less than SHA256_HEX_LENGTH + .unwrap_or(false) + { + Ok(Self::Deep { length, depth }) + } else { + Err(Error::bad_config("The media directory structure length multiplied by the depth is equal to or greater than a sha256 hex hash, please reduce at least one of the two so that their product is less than 64")) + } + } + } + } } const DEPRECATED_KEYS: &[&str] = &[ diff --git a/src/main.rs b/src/main.rs index 96aa2714..01af9ad2 100644 --- a/src/main.rs +++ b/src/main.rs @@ -47,33 +47,53 @@ static GLOBAL: Jemalloc = Jemalloc; static 
SUB_TABLES: [&str; 3] = ["well_known", "tls", "media"]; // Not doing `proxy` cause setting that with env vars would be a pain +// Yeah, I know it's terrible, but since it seems the container users don't want syntax like A[B][C]="...", +// this is what we have to deal with. Also see: https://github.com/SergioBenitez/Figment/issues/12#issuecomment-801449465 +static SUB_SUB_TABLES: [&str; 1] = ["directory_structure"]; + #[tokio::main] async fn main() { clap::parse(); // Initialize config - let raw_config = - Figment::new() - .merge( - Toml::file(Env::var("CONDUIT_CONFIG").expect( + let raw_config = Figment::new() + .merge( + Toml::file( + Env::var("CONDUIT_CONFIG").expect( "The CONDUIT_CONFIG env var needs to be set. Example: /etc/conduit.toml", - )) - .nested(), + ), ) - .merge(Env::prefixed("CONDUIT_").global().map(|k| { - let mut key: Uncased = k.into(); + .nested(), + ) + .merge(Env::prefixed("CONDUIT_").global().map(|k| { + let mut key: Uncased = k.into(); - for table in SUB_TABLES { - if k.starts_with(&(table.to_owned() + "_")) { - key = Uncased::from( - table.to_owned() + "." + k[table.len() + 1..k.len()].as_str(), - ); - break; + 'outer: for table in SUB_TABLES { + if k.starts_with(&(table.to_owned() + "_")) { + for sub_table in SUB_SUB_TABLES { + if k.starts_with(&(table.to_owned() + "_" + sub_table + "_")) { + key = Uncased::from( + table.to_owned() + + "." + + sub_table + + "." + + k[table.len() + 1 + sub_table.len() + 1..k.len()].as_str(), + ); + + break 'outer; + } } - } - key - })); + key = Uncased::from( + table.to_owned() + "." 
+ k[table.len() + 1..k.len()].as_str(), + ); + + break; + } + } + + key + })); let config = match raw_config.extract::() { Ok(s) => s, diff --git a/src/service/globals/mod.rs b/src/service/globals/mod.rs index d7cf19b1..ac77afe9 100644 --- a/src/service/globals/mod.rs +++ b/src/service/globals/mod.rs @@ -8,7 +8,7 @@ use ruma::{ use crate::api::server_server::DestinationResponse; use crate::{ - config::{MediaConfig, TurnConfig}, + config::{DirectoryStructure, MediaConfig, TurnConfig}, services, Config, Error, Result, }; use futures_util::FutureExt; @@ -230,7 +230,7 @@ impl Service { // Remove this exception once other media backends are added #[allow(irrefutable_let_patterns)] - if let MediaConfig::FileSystem { path } = &s.config.media { + if let MediaConfig::FileSystem { path, .. } = &s.config.media { fs::create_dir_all(path)?; } @@ -482,14 +482,32 @@ impl Service { self.db.bump_database_version(new_version) } - pub fn get_media_path(&self, media_directory: &str, sha256_hex: &str) -> PathBuf { + pub fn get_media_path( + &self, + media_directory: &str, + directory_structure: &DirectoryStructure, + sha256_hex: &str, + ) -> Result { let mut r = PathBuf::new(); r.push(media_directory); - //TODO: Directory distribution - r.push(sha256_hex); + if let DirectoryStructure::Deep { length, depth } = directory_structure { + let mut filename = sha256_hex; + for _ in 0..depth.get() { + let (current_path, next) = filename.split_at(length.get().into()); + filename = next; + r.push(current_path); + } - r + // Create all directories leading up to file + fs::create_dir_all(&r).inspect_err(|e| error!("Error creating leading directories for media with sha256 hash of {sha256_hex}: {e}"))?; + + r.push(filename); + } else { + r.push(sha256_hex); + } + + Ok(r) } pub fn shutdown(&self) { diff --git a/src/service/media/mod.rs b/src/service/media/mod.rs index 81d66210..447ed566 100644 --- a/src/service/media/mod.rs +++ b/src/service/media/mod.rs @@ -298,8 +298,13 @@ impl Service { /// Note: 
this function does NOT set the metadata related to the file pub async fn create_file(sha256_hex: &str, file: &[u8]) -> Result<()> { match &services().globals.config.media { - MediaConfig::FileSystem { path } => { - let path = services().globals.get_media_path(path, sha256_hex); + MediaConfig::FileSystem { + path, + directory_structure, + } => { + let path = services() + .globals + .get_media_path(path, directory_structure, sha256_hex)?; let mut f = File::create(path).await?; f.write_all(file).await?; @@ -312,8 +317,13 @@ pub async fn create_file(sha256_hex: &str, file: &[u8]) -> Result<()> { /// Fetches the file from the configured media backend async fn get_file(sha256_hex: &str) -> Result> { Ok(match &services().globals.config.media { - MediaConfig::FileSystem { path } => { - let path = services().globals.get_media_path(path, sha256_hex); + MediaConfig::FileSystem { + path, + directory_structure, + } => { + let path = services() + .globals + .get_media_path(path, directory_structure, sha256_hex)?; let mut file = Vec::new(); File::open(path).await?.read_to_end(&mut file).await?;