From bb4cade9fd8bb7860bd9ce4d64f01a30ba9c4de1 Mon Sep 17 00:00:00 2001
From: Reiner Herrmann
Date: Fri, 28 Jul 2023 17:53:15 +0200
Subject: [PATCH] Preview URLs

Closes: #14
---
Reviewer notes (this section is ignored by `git am`):
- All new code is gated behind the optional `url_preview` Cargo feature and the
  `allow_url_preview` option (serde default: false). Without the feature the
  route is still registered but answers "URL preview not implemented".
- The feature-gated `use` block in src/api/client_server/media.rs does not
  import `Duration`: the file already has a top-level
  `use std::time::Duration;`, and importing the same name twice is rejected
  by rustc (E0252).

 Cargo.toml                      |   3 +
 conduit-example.toml            |   3 +
 debian/postinst                 |   3 +
 src/api/client_server/media.rs  | 237 +++++++++++++++++++++++++++++++-
 src/config/mod.rs               |   3 +
 src/database/key_value/media.rs | 109 +++++++++++++++
 src/database/mod.rs             |   4 +
 src/main.rs                     |   1 +
 src/service/globals/mod.rs      |   4 +
 src/service/media/data.rs       |  20 +++
 src/service/media/mod.rs        |  67 +++++++++
 src/service/mod.rs              |   9 +-
 12 files changed, 460 insertions(+), 3 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index c74773a0..766cf49c 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -147,6 +147,8 @@ tikv-jemallocator = { version = "0.5.0", features = [
 
 sd-notify = { version = "0.4.1", optional = true }
 
+webpage = { version = "1.6", default-features = false, optional = true }
+
 # Used for matrix spec type definitions and helpers
 [dependencies.ruma]
 features = [
@@ -186,6 +188,7 @@ conduit_bin = ["axum"]
 jemalloc = ["tikv-jemallocator"]
 sqlite = ["parking_lot", "rusqlite", "tokio/signal"]
 systemd = ["sd-notify"]
+url_preview = ["webpage"]
 
 [[bin]]
 name = "conduit"
diff --git a/conduit-example.toml b/conduit-example.toml
index 74cbb074..969c0074 100644
--- a/conduit-example.toml
+++ b/conduit-example.toml
@@ -47,6 +47,9 @@ registration_token = ""
 allow_check_for_updates = true
 allow_federation = true
 
+# Allows clients to request a URL preview
+allow_url_preview = false
+
 # Enable the display name lightning bolt on registration.
 enable_lightning_bolt = true
 
diff --git a/debian/postinst b/debian/postinst
index 6361af5a..8738ffe8 100644
--- a/debian/postinst
+++ b/debian/postinst
@@ -84,6 +84,9 @@ allow_check_for_updates = true
 # Enable the display name lightning bolt on registration.
 enable_lightning_bolt = true
 
+# Allows clients to request a URL preview
+allow_url_preview = false
+
 # Servers listed here will be used to gather public keys of other servers.
 # Generally, copying this exactly should be enough. (Currently, Conduit doesn't
 # support batched key requests, so this list should only contain Synapse
diff --git a/src/api/client_server/media.rs b/src/api/client_server/media.rs
index 5cd2b2f9..75492893 100644
--- a/src/api/client_server/media.rs
+++ b/src/api/client_server/media.rs
@@ -2,13 +2,22 @@ use std::time::Duration;
 
 use crate::{service::media::FileMeta, services, utils, Error, Result, Ruma};
 use ruma::api::client::{
-    error::ErrorKind,
+    error::{ErrorKind, RetryAfter},
     media::{
         create_content, get_content, get_content_as_filename, get_content_thumbnail,
-        get_media_config,
+        get_media_config, get_media_preview
     },
 };
 
+#[cfg(feature = "url_preview")]
+use {
+    crate::service::media::UrlPreviewData,
+    webpage::HTML,
+    std::{io::Cursor, net::IpAddr, sync::Arc},
+    tokio::sync::Notify,
+    image::io::Reader as ImgReader,
+};
+
 const MXC_LENGTH: usize = 32;
 
 /// # `GET /_matrix/media/r0/config`
@@ -22,6 +31,230 @@ pub async fn get_media_config_route(
     })
 }
 
+#[cfg(feature = "url_preview")]
+async fn download_image(
+    client: &reqwest::Client,
+    url: &str,
+) -> Result<UrlPreviewData> {
+    let image = client.get(url).send().await?.bytes().await?;
+    let mxc = format!(
+        "mxc://{}/{}",
+        services().globals.server_name(),
+        utils::random_string(MXC_LENGTH)
+    );
+    services().media
+        .create(mxc.clone(), None, None, &image)
+        .await?;
+
+    let (width, height) = match ImgReader::new(Cursor::new(&image)).with_guessed_format() {
+        Err(_) => (None, None),
+        Ok(reader) => match reader.into_dimensions() {
+            Err(_) => (None, None),
+            Ok((width, height)) => (Some(width), Some(height)),
+        },
+    };
+
+    Ok(UrlPreviewData {
+        image: Some(mxc),
+        image_size: Some(image.len()),
+        image_width: width,
+        image_height: height,
+        ..Default::default()
+    })
+}
+
+#[cfg(feature = "url_preview")]
+async fn download_html(
+    client: &reqwest::Client,
+    url: &str,
+) -> Result<UrlPreviewData> {
+    let max_download_size = 300_000;
+
+    let mut response = client.get(url).send().await?;
+
+    let mut bytes: Vec<u8> = Vec::new();
+    while let Some(chunk) = response.chunk().await? {
+        bytes.extend_from_slice(&chunk);
+        if bytes.len() > max_download_size {
+            break;
+        }
+    }
+    let body = String::from_utf8_lossy(&bytes);
+    let html = match HTML::from_string(body.to_string(), Some(url.to_owned())) {
+        Ok(html) => html,
+        Err(_) => {
+            return Err(Error::BadRequest(
+                ErrorKind::Unknown,
+                "Failed to parse HTML",
+            ))
+        }
+    };
+
+    let mut data = match html.opengraph.images.first() {
+        None => UrlPreviewData::default(),
+        Some(obj) => download_image(client, &obj.url).await?,
+    };
+
+    let props = html.opengraph.properties;
+    /* use OpenGraph title/description, but fall back to HTML if not available */
+    data.title = props.get("title").cloned().or(html.title);
+    data.description = props.get("description").cloned().or(html.description);
+    Ok(data)
+}
+
+#[cfg(feature = "url_preview")]
+fn url_request_allowed(addr: &IpAddr) -> bool {
+    // could be implemented with reqwest when it supports IP filtering:
+    // https://github.com/seanmonstar/reqwest/issues/1515
+
+    // TODO: simplify to .is_global() when it has been stabilized
+    match addr {
+        IpAddr::V4(ip4) => {
+            !(ip4.is_private()
+                || ip4.is_loopback()
+                || ip4.is_link_local()
+                || ip4.is_multicast()
+                || ip4.is_broadcast()
+                || ip4.is_documentation()
+                || ip4.is_unspecified())
+        }
+        IpAddr::V6(ip6) => !(ip6.is_loopback() || ip6.is_multicast() || ip6.is_unspecified()),
+    }
+}
+
+#[cfg(feature = "url_preview")]
+async fn request_url_preview(url: String) -> Result<UrlPreviewData> {
+    let client = services().globals.default_client();
+    let response = client.head(&url).send().await?;
+
+    if !response
+        .remote_addr()
+        .map_or(false, |a| url_request_allowed(&a.ip()))
+    {
+        return Err(Error::BadRequest(
+            ErrorKind::Unknown,
+            "Requesting from this address forbidden",
+        ));
+    }
+
+    let content_type = match response
+        .headers()
+        .get(reqwest::header::CONTENT_TYPE)
+        .and_then(|x| x.to_str().ok())
+    {
+        Some(ct) => ct,
+        None => {
+            return Err(Error::BadRequest(
+                ErrorKind::Unknown,
+                "Unknown Content-Type",
+            ))
+        }
+    };
+    let data = match content_type {
+        html if html.starts_with("text/html") => download_html(&client, &url).await?,
+        img if img.starts_with("image/") => download_image(&client, &url).await?,
+        _ => {
+            return Err(Error::BadRequest(
+                ErrorKind::Unknown,
+                "Unsupported Content-Type",
+            ))
+        }
+    };
+
+    services().media.set_url_preview(&url, &data).await?;
+
+    Ok(data)
+}
+
+#[cfg(feature = "url_preview")]
+async fn get_url_preview(url: String) -> Result<UrlPreviewData> {
+    if let Some(preview) = services().media.get_url_preview(&url).await {
+        return Ok(preview);
+    }
+
+    let notif_opt = services()
+        .media
+        .url_preview_requests
+        .read()
+        .unwrap()
+        .get(&url)
+        .cloned();
+
+    match notif_opt {
+        None => {
+            let notifier = Arc::new(Notify::new());
+            {
+                services().media
+                    .url_preview_requests
+                    .write()
+                    .unwrap()
+                    .insert(url.clone(), notifier.clone());
+            }
+
+            let data = request_url_preview(url.clone()).await;
+
+            notifier.notify_waiters();
+
+            {
+                services().media.url_preview_requests.write().unwrap().remove(&url);
+            }
+
+            data
+        }
+        Some(notifier) => {
+            // wait until being notified that request is finished
+            let notifier = notifier.clone();
+            let notifier = notifier.notified();
+            notifier.await;
+
+            services().media
+                .get_url_preview(&url)
+                .await
+                .ok_or(Error::BadRequest(
+                    ErrorKind::Unknown,
+                    "No Preview available",
+                ))
+        }
+    }
+}
+
+/// # `GET /_matrix/media/r0/preview_url`
+///
+/// Returns URL preview.
+#[cfg(feature = "url_preview")]
+pub async fn get_media_preview_route(
+    body: Ruma<get_media_preview::v3::Request>,
+) -> Result<get_media_preview::v3::Response> {
+    if !services().globals.allow_url_preview() {
+        return Err(Error::BadRequest(
+            ErrorKind::Unknown,
+            "Previewing URL not allowed",
+        ));
+    }
+
+    if let Ok(preview) = get_url_preview(body.url.clone()).await {
+        let res = serde_json::value::to_raw_value(&preview).expect("Converting to JSON failed");
+        return Ok(get_media_preview::v3::Response::from_raw_value(res));
+    }
+
+    Err(Error::BadRequest(
+        ErrorKind::LimitExceeded {
+            retry_after: Some(RetryAfter::Delay(Duration::from_secs(5))),
+        },
+        "Retry later",
+    ))
+}
+
+#[cfg(not(feature = "url_preview"))]
+pub async fn get_media_preview_route(
+    _body: Ruma<get_media_preview::v3::Request>,
+) -> Result<get_media_preview::v3::Response> {
+    Err(Error::BadRequest(
+        ErrorKind::Forbidden,
+        "URL preview not implemented",
+    ))
+}
+
 /// # `POST /_matrix/media/r0/upload`
 ///
 /// Permanently save media in the server.
diff --git a/src/config/mod.rs b/src/config/mod.rs
index 378ab929..d661a5de 100644
--- a/src/config/mod.rs
+++ b/src/config/mod.rs
@@ -53,6 +53,8 @@ pub struct Config {
     pub allow_encryption: bool,
     #[serde(default = "false_fn")]
     pub allow_federation: bool,
+    #[serde(default = "false_fn")]
+    pub allow_url_preview: bool,
     #[serde(default = "true_fn")]
     pub allow_room_creation: bool,
     #[serde(default = "true_fn")]
@@ -184,6 +186,7 @@ impl fmt::Display for Config {
             ),
             ("Allow encryption", &self.allow_encryption.to_string()),
             ("Allow federation", &self.allow_federation.to_string()),
+            ("Allow URL preview", &self.allow_url_preview.to_string()),
             ("Allow room creation", &self.allow_room_creation.to_string()),
             (
                 "JWT secret",
diff --git a/src/database/key_value/media.rs b/src/database/key_value/media.rs
index 6abe5ba5..2ad5e3fc 100644
--- a/src/database/key_value/media.rs
+++ b/src/database/key_value/media.rs
@@ -2,6 +2,9 @@ use ruma::api::client::error::ErrorKind;
 
 use crate::{database::KeyValueDatabase, service, utils, Error, Result};
 
+#[cfg(feature = "url_preview")]
+use crate::service::media::UrlPreviewData;
+
 impl service::media::Data for KeyValueDatabase {
     fn create_file_metadata(
         &self,
@@ -79,4 +82,110 @@ impl service::media::Data for KeyValueDatabase {
         };
         Ok((content_disposition, content_type, key))
     }
+
+    #[cfg(feature = "url_preview")]
+    fn remove_url_preview(&self, url: &str) -> Result<()> {
+        self.url_previews.remove(url.as_bytes())
+    }
+
+    #[cfg(feature = "url_preview")]
+    fn set_url_preview(&self, url: &str, data: &UrlPreviewData, timestamp: std::time::Duration) -> Result<()> {
+        let mut value = Vec::<u8>::new();
+        value.extend_from_slice(&timestamp.as_secs().to_be_bytes());
+        value.push(0xff);
+        value.extend_from_slice(
+            data.title
+                .as_ref()
+                .map(|t| t.as_bytes())
+                .unwrap_or_default(),
+        );
+        value.push(0xff);
+        value.extend_from_slice(
+            data.description
+                .as_ref()
+                .map(|d| d.as_bytes())
+                .unwrap_or_default(),
+        );
+        value.push(0xff);
+        value.extend_from_slice(
+            data.image
+                .as_ref()
+                .map(|i| i.as_bytes())
+                .unwrap_or_default(),
+        );
+        value.push(0xff);
+        value.extend_from_slice(&data.image_size.unwrap_or(0).to_be_bytes());
+        value.push(0xff);
+        value.extend_from_slice(&data.image_width.unwrap_or(0).to_be_bytes());
+        value.push(0xff);
+        value.extend_from_slice(&data.image_height.unwrap_or(0).to_be_bytes());
+
+        self.url_previews.insert(url.as_bytes(), &value)
+    }
+
+    #[cfg(feature = "url_preview")]
+    fn get_url_preview(&self, url: &str) -> Option<UrlPreviewData> {
+        let values = self.url_previews.get(url.as_bytes()).ok()??;
+
+        let mut values = values.split(|&b| b == 0xff);
+
+        let _ts = match values
+            .next()
+            .map(|b| u64::from_be_bytes(b.try_into().expect("valid BE array")))
+        {
+            Some(0) => None,
+            x => x,
+        };
+        let title = match values
+            .next()
+            .and_then(|b| String::from_utf8(b.to_vec()).ok())
+        {
+            Some(s) if s.is_empty() => None,
+            x => x,
+        };
+        let description = match values
+            .next()
+            .and_then(|b| String::from_utf8(b.to_vec()).ok())
+        {
+            Some(s) if s.is_empty() => None,
+            x => x,
+        };
+        let image = match values
+            .next()
+            .and_then(|b| String::from_utf8(b.to_vec()).ok())
+        {
+            Some(s) if s.is_empty() => None,
+            x => x,
+        };
+        let image_size = match values
+            .next()
+            .map(|b| usize::from_be_bytes(b.try_into().expect("valid BE array")))
+        {
+            Some(0) => None,
+            x => x,
+        };
+        let image_width = match values
+            .next()
+            .map(|b| u32::from_be_bytes(b.try_into().expect("valid BE array")))
+        {
+            Some(0) => None,
+            x => x,
+        };
+        let image_height = match values
+            .next()
+            .map(|b| u32::from_be_bytes(b.try_into().expect("valid BE array")))
+        {
+            Some(0) => None,
+            x => x,
+        };
+
+        Some(UrlPreviewData {
+            title,
+            description,
+            image,
+            image_size,
+            image_width,
+            image_height,
+        })
+    }
 }
diff --git a/src/database/mod.rs b/src/database/mod.rs
index 5171d4bb..40446e9c 100644
--- a/src/database/mod.rs
+++ b/src/database/mod.rs
@@ -146,6 +146,8 @@ pub struct KeyValueDatabase {
 
     //pub media: media::Media,
     pub(super) mediaid_file: Arc<dyn KvTree>, // MediaId = MXC + WidthHeight + ContentDisposition + ContentType
+    #[cfg(feature = "url_preview")]
+    pub(super) url_previews: Arc<dyn KvTree>,
     //pub key_backups: key_backups::KeyBackups,
     pub(super) backupid_algorithm: Arc<dyn KvTree>, // BackupId = UserId + Version(Count)
     pub(super) backupid_etag: Arc<dyn KvTree>, // BackupId = UserId + Version(Count)
@@ -362,6 +364,8 @@ impl KeyValueDatabase {
             roomuserdataid_accountdata: builder.open_tree("roomuserdataid_accountdata")?,
             roomusertype_roomuserdataid: builder.open_tree("roomusertype_roomuserdataid")?,
             mediaid_file: builder.open_tree("mediaid_file")?,
+            #[cfg(feature = "url_preview")]
+            url_previews: builder.open_tree("url_previews")?,
             backupid_algorithm: builder.open_tree("backupid_algorithm")?,
             backupid_etag: builder.open_tree("backupid_etag")?,
             backupkeyid_backup: builder.open_tree("backupkeyid_backup")?,
diff --git a/src/main.rs b/src/main.rs
index 8d242c53..232aa2cd 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -379,6 +379,7 @@ fn routes(config: &Config) -> Router {
         .ruma_route(client_server::turn_server_route)
         .ruma_route(client_server::send_event_to_device_route)
         .ruma_route(client_server::get_media_config_route)
+        .ruma_route(client_server::get_media_preview_route)
         .ruma_route(client_server::create_content_route)
         .ruma_route(client_server::get_content_route)
         .ruma_route(client_server::get_content_as_filename_route)
diff --git a/src/service/globals/mod.rs b/src/service/globals/mod.rs
index fc695f86..513f2bc7 100644
--- a/src/service/globals/mod.rs
+++ b/src/service/globals/mod.rs
@@ -324,6 +324,10 @@ impl Service {
         self.config.allow_federation
     }
 
+    pub fn allow_url_preview(&self) -> bool {
+        self.config.allow_url_preview
+    }
+
     pub fn allow_room_creation(&self) -> bool {
         self.config.allow_room_creation
     }
diff --git a/src/service/media/data.rs b/src/service/media/data.rs
index 75a682cb..c4621814 100644
--- a/src/service/media/data.rs
+++ b/src/service/media/data.rs
@@ -17,4 +17,24 @@ pub trait Data: Send + Sync {
         width: u32,
         height: u32,
     ) -> Result<(Option<String>, Option<String>, Vec<u8>)>;
+
+    #[cfg(feature = "url_preview")]
+    fn remove_url_preview(
+        &self,
+        url: &str
+    ) -> Result<()>;
+
+    #[cfg(feature = "url_preview")]
+    fn set_url_preview(
+        &self,
+        url: &str,
+        data: &super::UrlPreviewData,
+        timestamp: std::time::Duration,
+    ) -> Result<()>;
+
+    #[cfg(feature = "url_preview")]
+    fn get_url_preview(
+        &self,
+        url: &str
+    ) -> Option<super::UrlPreviewData>;
 }
diff --git a/src/service/media/mod.rs b/src/service/media/mod.rs
index 0340ab49..ef0d752c 100644
--- a/src/service/media/mod.rs
+++ b/src/service/media/mod.rs
@@ -11,14 +11,62 @@ use tokio::{
     io::{AsyncReadExt, AsyncWriteExt, BufReader},
 };
 
+#[cfg(feature = "url_preview")]
+use {
+    std::{
+        collections::HashMap,
+        sync::{Arc, RwLock},
+    },
+    serde::Serialize,
+    std::time::SystemTime,
+    tokio::sync::Notify,
+};
+
 pub struct FileMeta {
     pub content_disposition: Option<String>,
     pub content_type: Option<String>,
     pub file: Vec<u8>,
 }
 
+#[cfg(feature = "url_preview")]
+#[derive(Serialize, Default)]
+pub struct UrlPreviewData {
+    #[serde(
+        skip_serializing_if = "Option::is_none",
+        rename(serialize = "og:title")
+    )]
+    pub title: Option<String>,
+    #[serde(
+        skip_serializing_if = "Option::is_none",
+        rename(serialize = "og:description")
+    )]
+    pub description: Option<String>,
+    #[serde(
+        skip_serializing_if = "Option::is_none",
+        rename(serialize = "og:image")
+    )]
+    pub image: Option<String>,
+    #[serde(
+        skip_serializing_if = "Option::is_none",
+        rename(serialize = "matrix:image:size")
+    )]
+    pub image_size: Option<usize>,
+    #[serde(
+        skip_serializing_if = "Option::is_none",
+        rename(serialize = "og:image:width")
+    )]
+    pub image_width: Option<u32>,
+    #[serde(
+        skip_serializing_if = "Option::is_none",
+        rename(serialize = "og:image:height")
+    )]
+    pub image_height: Option<u32>,
+}
+
 pub struct Service {
     pub db: &'static dyn Data,
+    #[cfg(feature = "url_preview")]
+    pub url_preview_requests: RwLock<HashMap<String, Arc<Notify>>>,
 }
 
 impl Service {
@@ -225,4 +273,23 @@ impl Service {
             Ok(None)
         }
     }
+
+    #[cfg(feature = "url_preview")]
+    pub async fn get_url_preview(&self, url: &str) -> Option<UrlPreviewData> {
+        self.db.get_url_preview(url)
+    }
+
+    #[cfg(feature = "url_preview")]
+    pub async fn remove_url_preview(&self, url: &str) -> Result<()> {
+        // TODO: also remove the downloaded image
+        self.db.remove_url_preview(url)
+    }
+
+    #[cfg(feature = "url_preview")]
+    pub async fn set_url_preview(&self, url: &str, data: &UrlPreviewData) -> Result<()> {
+        let now = SystemTime::now()
+            .duration_since(SystemTime::UNIX_EPOCH)
+            .expect("valid system time");
+        self.db.set_url_preview(url, data, now)
+    }
 }
diff --git a/src/service/mod.rs b/src/service/mod.rs
index 4c11bc18..c5ba84a9 100644
--- a/src/service/mod.rs
+++ b/src/service/mod.rs
@@ -3,6 +3,9 @@ use std::{
     sync::{Arc, Mutex as StdMutex},
 };
 
+#[cfg(feature = "url_preview")]
+use std::sync::RwLock;
+
 use lru_cache::LruCache;
 use tokio::sync::{broadcast, Mutex};
 
@@ -118,7 +121,11 @@ impl Services {
             account_data: account_data::Service { db },
             admin: admin::Service::build(),
             key_backups: key_backups::Service { db },
-            media: media::Service { db },
+            media: media::Service {
+                db,
+                #[cfg(feature = "url_preview")]
+                url_preview_requests: RwLock::new(HashMap::new())
+            },
             sending: sending::Service::build(db, &config),
 
             globals: globals::Service::load(db, config)?,