fix: fix mikan web extractors

2025-02-25 01:02:38 +08:00
parent 09565bd827
commit 5bc5d98823
26 changed files with 9537 additions and 659 deletions

View File

@@ -1,3 +1,5 @@
use std::{borrow::Cow, error::Error as StdError};
use thiserror::Error;
#[derive(Error, Debug)]
@@ -16,4 +18,19 @@ pub enum ExtractError {
MikanRssFormatError { url: String },
#[error("Parse mikan rss item format error, {reason}")]
MikanRssItemFormatError { reason: String },
#[error("Missing field {field} in extracting meta")]
MikanMetaMissingFieldError {
field: Cow<'static, str>,
#[source]
source: Option<Box<dyn StdError + Send + Sync>>,
},
}
impl ExtractError {
pub fn from_mikan_meta_missing_field(field: Cow<'static, str>) -> Self {
Self::MikanMetaMissingFieldError {
field,
source: None,
}
}
}
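
A minimal sketch of how the new variant behaves, with the types repeated from the hunk above so the snippet compiles on its own: the helper attaches no underlying cause, so the error chain ends at this error while the `#[error]` attribute drives the display message.

use std::{borrow::Cow, error::Error as StdError};
use thiserror::Error;

#[derive(Error, Debug)]
pub enum ExtractError {
    #[error("Missing field {field} in extracting meta")]
    MikanMetaMissingFieldError {
        field: Cow<'static, str>,
        #[source]
        source: Option<Box<dyn StdError + Send + Sync>>,
    },
}

impl ExtractError {
    pub fn from_mikan_meta_missing_field(field: Cow<'static, str>) -> Self {
        Self::MikanMetaMissingFieldError { field, source: None }
    }
}

fn main() {
    let err = ExtractError::from_mikan_meta_missing_field(Cow::Borrowed("bangumi_title"));
    // Display comes from the #[error] attribute on the variant.
    assert_eq!(err.to_string(), "Missing field bangumi_title in extracting meta");
    // No cause was attached, so source() is None.
    assert!(StdError::source(&err).is_none());
}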

View File

@@ -1,3 +1,11 @@
pub mod styles;
pub use styles::parse_style_attr;
use html_escape::decode_html_entities;
use itertools::Itertools;
use scraper::ElementRef;
pub use styles::{extract_background_image_src_from_style_attr, extract_style_from_attr};
pub fn extract_inner_text_from_element_ref(el: ElementRef<'_>) -> String {
let raw_text = el.text().collect_vec().join(",");
decode_html_entities(&raw_text).trim().to_string()
}
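
A small usage sketch (helper repeated from the hunk above so it runs standalone). Note that sibling text nodes are joined with a comma before entity decoding and trimming:

use html_escape::decode_html_entities;
use itertools::Itertools;
use scraper::{ElementRef, Html, Selector};

pub fn extract_inner_text_from_element_ref(el: ElementRef<'_>) -> String {
    let raw_text = el.text().collect_vec().join(",");
    decode_html_entities(&raw_text).trim().to_string()
}

fn main() {
    let html = Html::parse_fragment(r#"<p class="bangumi-title">  葬送的芙莉莲  </p>"#);
    let selector = Selector::parse(".bangumi-title").unwrap();
    let el = html.select(&selector).next().unwrap();
    // Whitespace around the single text node is trimmed away.
    assert_eq!(extract_inner_text_from_element_ref(el), "葬送的芙莉莲");
}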

View File

@@ -1,6 +1,45 @@
use lightningcss::declaration::DeclarationBlock;
use lightningcss::{
declaration::DeclarationBlock, properties::Property, values::image::Image as CSSImage,
};
use url::Url;
pub fn parse_style_attr(style_attr: &str) -> Option<DeclarationBlock> {
use crate::extract::media::extract_image_src_from_str;
pub fn extract_style_from_attr(style_attr: &str) -> Option<DeclarationBlock> {
let result = DeclarationBlock::parse_string(style_attr, Default::default()).ok()?;
Some(result)
}
pub fn extract_background_image_src_from_style_attr(
style_attr: &str,
base_url: &Url,
) -> Option<Url> {
extract_style_from_attr(style_attr).and_then(|style| {
style.iter().find_map(|(prop, _)| {
match prop {
Property::BackgroundImage(images) => {
for img in images {
if let CSSImage::Url(path) = img {
if let Some(url) = extract_image_src_from_str(path.url.trim(), base_url)
{
return Some(url);
}
}
}
}
Property::Background(backgrounds) => {
for bg in backgrounds {
if let CSSImage::Url(path) = &bg.image {
if let Some(url) = extract_image_src_from_str(path.url.trim(), base_url)
{
return Some(url);
}
}
}
}
_ => {}
}
None
})
})
}
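
Assuming the two helpers above are in scope, extraction from a poster style attribute looks like the sketch below; the style value is illustrative, in the shape Mikan uses on `.bangumi-poster` elements, and the query string is dropped by `extract_image_src_from_str`:

use url::Url;

fn main() {
    let base_url = Url::parse("https://mikanani.me/").unwrap();
    // Illustrative value shaped like Mikan's `.bangumi-poster` style attribute.
    let style = "background-image: url('/images/Bangumi/202309/5ce9fed1.jpg?width=400')";
    let src = extract_background_image_src_from_style_attr(style, &base_url);
    assert_eq!(
        src.map(|u| u.to_string()),
        Some("https://mikanani.me/images/Bangumi/202309/5ce9fed1.jpg".to_string())
    );
}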

View File

@@ -0,0 +1,8 @@
use url::Url;
pub fn extract_image_src_from_str(image_src: &str, base_url: &Url) -> Option<Url> {
let mut image_url = base_url.join(image_src).ok()?;
image_url.set_query(None);
image_url.set_fragment(None);
Some(image_url)
}
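
The helper resolves possibly-relative `src`/`data-src` values against the page URL and normalizes away query and fragment. Repeated here with a tiny harness so it runs standalone:

use url::Url;

pub fn extract_image_src_from_str(image_src: &str, base_url: &Url) -> Option<Url> {
    let mut image_url = base_url.join(image_src).ok()?;
    image_url.set_query(None);
    image_url.set_fragment(None);
    Some(image_url)
}

fn main() {
    let base_url = Url::parse("https://mikanani.me/Home/Episode/abc").unwrap();
    // Absolute-path src resolves against the origin; the query is stripped.
    let src = extract_image_src_from_str("/images/Bangumi/202309/5ce9fed1.jpg?width=400", &base_url);
    assert_eq!(
        src.unwrap().as_str(),
        "https://mikanani.me/images/Bangumi/202309/5ce9fed1.jpg"
    );
}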

View File

@@ -3,15 +3,17 @@ use std::ops::Deref;
use async_trait::async_trait;
use loco_rs::app::{AppContext, Initializer};
use once_cell::sync::OnceCell;
use url::Url;
use super::AppMikanConfig;
use crate::{config::AppConfigExt, fetch::HttpClient};
static APP_MIKAN_CLIENT: OnceCell<AppMikanClient> = OnceCell::new();
#[derive(Debug)]
pub struct AppMikanClient {
http_client: HttpClient,
base_url: String,
base_url: Url,
}
impl AppMikanClient {
@@ -31,7 +33,7 @@ impl AppMikanClient {
.expect("AppMikanClient is not initialized")
}
pub fn base_url(&self) -> &str {
pub fn base_url(&self) -> &Url {
&self.base_url
}
}

View File

@@ -1,9 +1,10 @@
use serde::{Deserialize, Serialize};
use url::Url;
use crate::fetch::HttpClientConfig;
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct AppMikanConfig {
pub http_client: HttpClientConfig,
pub base_url: String,
pub base_url: Url,
}

View File

@@ -1,22 +1,22 @@
pub mod client;
pub mod config;
pub mod constants;
pub mod rss_parser;
pub mod web_parser;
pub mod rss_extract;
pub mod web_extract;
pub use client::{AppMikanClient, AppMikanClientInitializer};
pub use config::AppMikanConfig;
pub use constants::MIKAN_BUCKET_KEY;
pub use rss_parser::{
build_mikan_bangumi_rss_link, build_mikan_subscriber_aggregation_rss_link,
parse_mikan_bangumi_id_from_rss_link, parse_mikan_rss_channel_from_rss_link,
parse_mikan_rss_items_from_rss_link, parse_mikan_subscriber_aggregation_id_from_rss_link,
pub use rss_extract::{
MikanBangumiAggregationRssChannel, MikanBangumiRssChannel, MikanBangumiRssLink,
MikanRssChannel, MikanRssItem, MikanSubscriberAggregationRssChannel,
MikanSubscriberAggregationRssLink,
MikanSubscriberAggregationRssLink, build_mikan_bangumi_rss_link,
build_mikan_subscriber_aggregation_rss_link, extract_mikan_bangumi_id_from_rss_link,
extract_mikan_subscriber_aggregation_id_from_rss_link, parse_mikan_rss_channel_from_rss_link,
parse_mikan_rss_items_from_rss_link,
};
pub use web_parser::{
build_mikan_bangumi_homepage, build_mikan_episode_homepage,
parse_mikan_bangumi_meta_from_mikan_homepage, parse_mikan_episode_meta_from_mikan_homepage,
MikanBangumiMeta, MikanEpisodeMeta,
pub use web_extract::{
MikanBangumiMeta, MikanEpisodeMeta, build_mikan_bangumi_homepage, build_mikan_episode_homepage,
extract_mikan_bangumi_meta_from_bangumi_homepage,
extract_mikan_episode_meta_from_episode_homepage,
};

View File

@@ -1,6 +1,7 @@
use std::ops::Deref;
use chrono::DateTime;
use color_eyre::eyre;
use itertools::Itertools;
use reqwest::IntoUrl;
use serde::{Deserialize, Serialize};
@@ -10,8 +11,8 @@ use crate::{
extract::{
errors::ExtractError,
mikan::{
web_parser::{parse_mikan_episode_id_from_homepage, MikanEpisodeHomepage},
AppMikanClient,
web_extract::{MikanEpisodeHomepage, parse_mikan_episode_id_from_homepage},
},
},
fetch::bytes::fetch_bytes,
@@ -163,11 +164,11 @@ pub struct MikanSubscriberAggregationRssLink {
}
pub fn build_mikan_bangumi_rss_link(
mikan_base_url: &str,
mikan_base_url: impl IntoUrl,
mikan_bangumi_id: &str,
mikan_fansub_id: Option<&str>,
) -> color_eyre::eyre::Result<Url> {
let mut url = Url::parse(mikan_base_url)?;
) -> eyre::Result<Url> {
let mut url = mikan_base_url.into_url()?;
url.set_path("/RSS/Bangumi");
url.query_pairs_mut()
.append_pair("bangumiId", mikan_bangumi_id);
@@ -181,7 +182,7 @@ pub fn build_mikan_bangumi_rss_link(
pub fn build_mikan_subscriber_aggregation_rss_link(
mikan_base_url: &str,
mikan_aggregation_id: &str,
) -> color_eyre::eyre::Result<Url> {
) -> eyre::Result<Url> {
let mut url = Url::parse(mikan_base_url)?;
url.set_path("/RSS/MyBangumi");
url.query_pairs_mut()
@@ -189,7 +190,7 @@ pub fn build_mikan_subscriber_aggregation_rss_link(
Ok(url)
}
pub fn parse_mikan_bangumi_id_from_rss_link(url: &Url) -> Option<MikanBangumiRssLink> {
pub fn extract_mikan_bangumi_id_from_rss_link(url: &Url) -> Option<MikanBangumiRssLink> {
if url.path() == "/RSS/Bangumi" {
url.query_pairs()
.find(|(k, _)| k == "bangumiId")
@@ -205,7 +206,7 @@ pub fn parse_mikan_bangumi_id_from_rss_link(url: &Url) -> Option<MikanBangumiRss
}
}
pub fn parse_mikan_subscriber_aggregation_id_from_rss_link(
pub fn extract_mikan_subscriber_aggregation_id_from_rss_link(
url: &Url,
) -> Option<MikanSubscriberAggregationRssLink> {
if url.path() == "/RSS/MyBangumi" {
@@ -222,7 +223,7 @@ pub fn parse_mikan_subscriber_aggregation_id_from_rss_link(
pub async fn parse_mikan_rss_items_from_rss_link(
client: Option<&AppMikanClient>,
url: impl IntoUrl,
) -> color_eyre::eyre::Result<Vec<MikanRssItem>> {
) -> eyre::Result<Vec<MikanRssItem>> {
let channel = parse_mikan_rss_channel_from_rss_link(client, url).await?;
Ok(channel.into_items())
@@ -231,7 +232,7 @@ pub async fn parse_mikan_rss_items_from_rss_link(
pub async fn parse_mikan_rss_channel_from_rss_link(
client: Option<&AppMikanClient>,
url: impl IntoUrl,
) -> color_eyre::eyre::Result<MikanRssChannel> {
) -> eyre::Result<MikanRssChannel> {
let http_client = client.map(|s| s.deref());
let bytes = fetch_bytes(http_client, url.as_str()).await?;
@@ -242,7 +243,7 @@ pub async fn parse_mikan_rss_channel_from_rss_link(
if let Some(MikanBangumiRssLink {
mikan_bangumi_id,
mikan_fansub_id,
}) = parse_mikan_bangumi_id_from_rss_link(&channel_link)
}) = extract_mikan_bangumi_id_from_rss_link(&channel_link)
{
let channel_name = channel.title().replace("Mikan Project - ", "");
@@ -274,7 +275,7 @@ pub async fn parse_mikan_rss_channel_from_rss_link(
} else if let Some(MikanSubscriberAggregationRssLink {
mikan_aggregation_id,
..
}) = parse_mikan_subscriber_aggregation_id_from_rss_link(&channel_link)
}) = extract_mikan_subscriber_aggregation_id_from_rss_link(&channel_link)
{
let items = channel
.items
@@ -304,8 +305,8 @@ mod tests {
use crate::{
extract::mikan::{
parse_mikan_rss_channel_from_rss_link, MikanBangumiAggregationRssChannel,
MikanBangumiRssChannel, MikanRssChannel,
MikanBangumiAggregationRssChannel, MikanBangumiRssChannel, MikanRssChannel,
parse_mikan_rss_channel_from_rss_link,
},
sync::core::BITTORRENT_MIME_TYPE,
};
@@ -333,10 +334,12 @@ mod tests {
assert_eq!(first_sub_item.mime, BITTORRENT_MIME_TYPE);
assert!(&first_sub_item
.homepage
.as_str()
.starts_with("https://mikanani.me/Home/Episode"));
assert!(
&first_sub_item
.homepage
.as_str()
.starts_with("https://mikanani.me/Home/Episode")
);
let name = first_sub_item.title.as_str();
assert!(name.contains("葬送的芙莉莲"));
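
With the signature now taking `impl IntoUrl`, callers can pass either a `&str` or a cloned `Url` base. A usage sketch, assuming `build_mikan_bangumi_rss_link` is in scope; only the `bangumiId` pair appears in the hunk above, so the fansub parameter is left out here:

fn main() -> color_eyre::eyre::Result<()> {
    // Base URL accepted as &str thanks to IntoUrl.
    let url = build_mikan_bangumi_rss_link("https://mikanani.me/", "3141", None)?;
    assert_eq!(url.as_str(), "https://mikanani.me/RSS/Bangumi?bangumiId=3141");
    Ok(())
}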

View File

@@ -0,0 +1,644 @@
use std::{borrow::Cow, ops::Deref};
use bytes::Bytes;
use color_eyre::eyre;
use loco_rs::app::AppContext;
use reqwest::IntoUrl;
use scraper::{Html, Selector};
use tracing::instrument;
use url::Url;
use super::{
AppMikanClient, MIKAN_BUCKET_KEY, MikanBangumiRssLink, extract_mikan_bangumi_id_from_rss_link,
};
use crate::{
app::AppContextExt,
dal::DalContentCategory,
extract::{
errors::ExtractError,
html::{extract_background_image_src_from_style_attr, extract_inner_text_from_element_ref},
media::extract_image_src_from_str,
},
fetch::{html::fetch_html, image::fetch_image},
};
#[derive(Clone, Debug, PartialEq)]
pub struct MikanEpisodeMeta {
pub homepage: Url,
pub origin_poster_src: Option<Url>,
pub bangumi_title: String,
pub episode_title: String,
pub fansub: String,
pub mikan_bangumi_id: String,
pub mikan_fansub_id: String,
pub mikan_episode_id: String,
}
#[derive(Clone, Debug, PartialEq)]
pub struct MikanBangumiMeta {
pub homepage: Url,
pub origin_poster_src: Option<Url>,
pub bangumi_title: String,
pub mikan_bangumi_id: String,
pub mikan_fansub_id: Option<String>,
pub fansub: Option<String>,
}
#[derive(Clone, Debug, PartialEq)]
pub struct MikanBangumiPosterMeta {
pub origin_poster_src: Url,
pub poster_data: Option<Bytes>,
pub poster_src: Option<String>,
}
#[derive(Clone, Debug, PartialEq)]
pub struct MikanEpisodeHomepage {
pub mikan_episode_id: String,
}
#[derive(Clone, Debug, PartialEq)]
pub struct MikanBangumiHomepage {
pub mikan_bangumi_id: String,
pub mikan_fansub_id: Option<String>,
}
pub fn build_mikan_bangumi_homepage(
mikan_base_url: impl IntoUrl,
mikan_bangumi_id: &str,
mikan_fansub_id: Option<&str>,
) -> eyre::Result<Url> {
let mut url = mikan_base_url.into_url()?;
url.set_path(&format!("/Home/Bangumi/{mikan_bangumi_id}"));
url.set_fragment(mikan_fansub_id);
Ok(url)
}
pub fn build_mikan_episode_homepage(
mikan_base_url: impl IntoUrl,
mikan_episode_id: &str,
) -> eyre::Result<Url> {
let mut url = mikan_base_url.into_url()?;
url.set_path(&format!("/Home/Episode/{mikan_episode_id}"));
Ok(url)
}
pub fn build_mikan_bangumi_expand_info_url(
mikan_base_url: impl IntoUrl,
mikan_bangumi_id: &str,
) -> eyre::Result<Url> {
let mut url = mikan_base_url.into_url()?;
url.set_path("/ExpandBangumi");
url.query_pairs_mut()
.append_pair("bangumiId", mikan_bangumi_id)
.append_pair("showSubscribed", "true");
Ok(url)
}
pub fn parse_mikan_bangumi_id_from_homepage(url: &Url) -> Option<MikanBangumiHomepage> {
if url.path().starts_with("/Home/Bangumi/") {
let mikan_bangumi_id = url.path().replace("/Home/Bangumi/", "");
Some(MikanBangumiHomepage {
mikan_bangumi_id,
mikan_fansub_id: url.fragment().map(String::from),
})
} else {
None
}
}
pub fn parse_mikan_episode_id_from_homepage(url: &Url) -> Option<MikanEpisodeHomepage> {
if url.path().starts_with("/Home/Episode/") {
let mikan_episode_id = url.path().replace("/Home/Episode/", "");
Some(MikanEpisodeHomepage { mikan_episode_id })
} else {
None
}
}
pub async fn extract_mikan_poster_meta_from_src(
client: Option<&AppMikanClient>,
origin_poster_src_url: Url,
) -> eyre::Result<MikanBangumiPosterMeta> {
let http_client = client.map(|s| s.deref());
let poster_data = fetch_image(http_client, origin_poster_src_url.clone()).await?;
Ok(MikanBangumiPosterMeta {
origin_poster_src: origin_poster_src_url,
poster_data: Some(poster_data),
poster_src: None,
})
}
pub async fn extract_mikan_bangumi_poster_meta_from_src_with_cache(
ctx: &AppContext,
origin_poster_src_url: Url,
subscriber_id: i32,
) -> eyre::Result<MikanBangumiPosterMeta> {
let dal_client = ctx.get_dal_client();
let mikan_client = ctx.get_mikan_client();
if let Some(poster_src) = dal_client
.exists_object(
DalContentCategory::Image,
subscriber_id,
Some(MIKAN_BUCKET_KEY),
&origin_poster_src_url.path().replace("/images/Bangumi/", ""),
)
.await?
{
return Ok(MikanBangumiPosterMeta {
origin_poster_src: origin_poster_src_url,
poster_data: None,
poster_src: Some(poster_src.to_string()),
});
}
let poster_data =
fetch_image(Some(mikan_client.deref()), origin_poster_src_url.clone()).await?;
let poster_str = dal_client
.store_object(
DalContentCategory::Image,
subscriber_id,
Some(MIKAN_BUCKET_KEY),
&origin_poster_src_url.path().replace("/images/Bangumi/", ""),
poster_data.clone(),
)
.await?;
Ok(MikanBangumiPosterMeta {
origin_poster_src: origin_poster_src_url,
poster_data: Some(poster_data),
poster_src: Some(poster_str.to_string()),
})
}
#[instrument(skip_all, fields(mikan_episode_homepage_url = mikan_episode_homepage_url.as_str()))]
pub async fn extract_mikan_episode_meta_from_episode_homepage(
client: Option<&AppMikanClient>,
mikan_episode_homepage_url: Url,
) -> eyre::Result<MikanEpisodeMeta> {
let http_client = client.map(|s| s.deref());
let mikan_base_url = Url::parse(&mikan_episode_homepage_url.origin().unicode_serialization())?;
let content = fetch_html(http_client, mikan_episode_homepage_url.as_str()).await?;
let html = Html::parse_document(&content);
let bangumi_title_selector =
&Selector::parse(".bangumi-title > a[href^='/Home/Bangumi/']").unwrap();
let mikan_bangumi_id_selector =
&Selector::parse(".bangumi-title > a.mikan-rss[data-original-title='RSS']").unwrap();
let bangumi_poster_selector = &Selector::parse(".bangumi-poster").unwrap();
let bangumi_title = html
.select(bangumi_title_selector)
.next()
.map(extract_inner_text_from_element_ref)
.ok_or_else(|| ExtractError::from_mikan_meta_missing_field(Cow::Borrowed("bangumi_title")))
.inspect_err(|error| {
tracing::warn!(error = %error);
})?;
let MikanBangumiRssLink {
mikan_bangumi_id,
mikan_fansub_id,
..
} = html
.select(mikan_bangumi_id_selector)
.next()
.and_then(|el| el.value().attr("href"))
.and_then(|s| mikan_episode_homepage_url.join(s).ok())
.and_then(|rss_link_url| extract_mikan_bangumi_id_from_rss_link(&rss_link_url))
.ok_or_else(|| {
ExtractError::from_mikan_meta_missing_field(Cow::Borrowed("mikan_bangumi_id"))
})
.inspect_err(|error| tracing::error!(error = %error))?;
let mikan_fansub_id = mikan_fansub_id
.ok_or_else(|| {
ExtractError::from_mikan_meta_missing_field(Cow::Borrowed("mikan_fansub_id"))
})
.inspect_err(|error| tracing::error!(error = %error))?;
let episode_title = html
.select(&Selector::parse("title").unwrap())
.next()
.map(extract_inner_text_from_element_ref)
.ok_or_else(|| ExtractError::from_mikan_meta_missing_field(Cow::Borrowed("episode_title")))
.inspect_err(|error| {
tracing::warn!(error = %error);
})?;
let MikanEpisodeHomepage {
mikan_episode_id, ..
} = parse_mikan_episode_id_from_homepage(&mikan_episode_homepage_url)
.ok_or_else(|| {
ExtractError::from_mikan_meta_missing_field(Cow::Borrowed("mikan_episode_id"))
})
.inspect_err(|error| {
tracing::warn!(error = %error);
})?;
let fansub_name = html
.select(
&Selector::parse(".bangumi-info a.magnet-link-wrap[href^='/Home/PublishGroup/']")
.unwrap(),
)
.next()
.map(extract_inner_text_from_element_ref)
.ok_or_else(|| ExtractError::from_mikan_meta_missing_field(Cow::Borrowed("fansub_name")))
.inspect_err(|error| {
tracing::warn!(error = %error);
})?;
let origin_poster_src = html.select(bangumi_poster_selector).next().and_then(|el| {
el.value()
.attr("data-src")
.and_then(|data_src| extract_image_src_from_str(data_src, &mikan_base_url))
.or_else(|| {
el.value().attr("style").and_then(|style| {
extract_background_image_src_from_style_attr(style, &mikan_base_url)
})
})
});
tracing::trace!(
bangumi_title,
mikan_bangumi_id,
episode_title,
mikan_episode_id,
origin_poster_src = origin_poster_src.as_ref().map(|url| url.as_str()),
fansub_name,
mikan_fansub_id,
"mikan episode meta extracted"
);
Ok(MikanEpisodeMeta {
mikan_bangumi_id,
mikan_fansub_id,
bangumi_title,
episode_title,
homepage: mikan_episode_homepage_url,
origin_poster_src,
fansub: fansub_name,
mikan_episode_id,
})
}
#[instrument(skip_all, fields(mikan_bangumi_homepage_url = mikan_bangumi_homepage_url.as_str()))]
pub async fn extract_mikan_bangumi_meta_from_bangumi_homepage(
client: Option<&AppMikanClient>,
mikan_bangumi_homepage_url: Url,
) -> eyre::Result<MikanBangumiMeta> {
let http_client = client.map(|s| s.deref());
let mikan_base_url = Url::parse(&mikan_bangumi_homepage_url.origin().unicode_serialization())?;
let content = fetch_html(http_client, mikan_bangumi_homepage_url.as_str()).await?;
let html = Html::parse_document(&content);
let bangumi_title_selector = &Selector::parse(".bangumi-title").unwrap();
let mikan_bangumi_id_selector =
&Selector::parse(".bangumi-title > .mikan-rss[data-original-title='RSS']").unwrap();
let bangumi_poster_selector = &Selector::parse(".bangumi-poster").unwrap();
let bangumi_title = html
.select(bangumi_title_selector)
.next()
.map(extract_inner_text_from_element_ref)
.ok_or_else(|| ExtractError::from_mikan_meta_missing_field(Cow::Borrowed("bangumi_title")))
.inspect_err(|error| tracing::warn!(error = %error))?;
let mikan_bangumi_id = html
.select(mikan_bangumi_id_selector)
.next()
.and_then(|el| el.value().attr("href"))
.and_then(|s| mikan_bangumi_homepage_url.join(s).ok())
.and_then(|rss_link_url| extract_mikan_bangumi_id_from_rss_link(&rss_link_url))
.map(
|MikanBangumiRssLink {
mikan_bangumi_id, ..
}| mikan_bangumi_id,
)
.ok_or_else(|| {
ExtractError::from_mikan_meta_missing_field(Cow::Borrowed("mikan_bangumi_id"))
})
.inspect_err(|error| tracing::error!(error = %error))?;
let origin_poster_src = html.select(bangumi_poster_selector).next().and_then(|el| {
el.value()
.attr("data-src")
.and_then(|data_src| extract_image_src_from_str(data_src, &mikan_base_url))
.or_else(|| {
el.value().attr("style").and_then(|style| {
extract_background_image_src_from_style_attr(style, &mikan_base_url)
})
})
});
let (mikan_fansub_id, fansub_name) = mikan_bangumi_homepage_url
.fragment()
.and_then(|id| {
html.select(
&Selector::parse(&format!("a.subgroup-name[data-anchor='#{}']", id)).unwrap(),
)
.next()
.map(extract_inner_text_from_element_ref)
.map(|fansub_name| (id.to_string(), fansub_name))
})
.unzip();
tracing::trace!(
bangumi_title,
mikan_bangumi_id,
origin_poster_src = origin_poster_src.as_ref().map(|url| url.as_str()),
fansub_name,
mikan_fansub_id,
"mikan bangumi meta extracted"
);
Ok(MikanBangumiMeta {
homepage: mikan_bangumi_homepage_url,
bangumi_title,
origin_poster_src,
mikan_bangumi_id,
fansub: fansub_name,
mikan_fansub_id,
})
}
/**
 * @login-required
 */
#[instrument(skip_all, fields(my_bangumi_page_url = my_bangumi_page_url.as_str()))]
pub async fn extract_mikan_bangumis_meta_from_my_bangumi_page(
client: Option<&AppMikanClient>,
my_bangumi_page_url: Url,
) -> eyre::Result<Vec<MikanBangumiMeta>> {
let http_client = client.map(|c| c.deref());
let mikan_base_url = Url::parse(&my_bangumi_page_url.origin().unicode_serialization())?;
let content = fetch_html(http_client, my_bangumi_page_url.clone()).await?;
let bangumi_container_selector = &Selector::parse(".sk-bangumi .an-ul>li").unwrap();
let bangumi_info_selector = &Selector::parse(".an-info a.an-text").unwrap();
let bangumi_poster_selector =
&Selector::parse("span[data-src][data-bangumiid], span[data-bangumiid][style]").unwrap();
let fansub_container_selector =
&Selector::parse(".js-expand_bangumi-subgroup.js-subscribed").unwrap();
let fansub_title_selector = &Selector::parse(".tag-res-name[title]").unwrap();
let fansub_id_selector =
&Selector::parse(".active[data-subtitlegroupid][data-bangumiid]").unwrap();
let html = Html::parse_document(&content);
let mut bangumi_list = vec![];
for bangumi_elem in html.select(bangumi_container_selector) {
let title_and_href_elem = bangumi_elem.select(bangumi_info_selector).next();
let poster_elem = bangumi_elem.select(bangumi_poster_selector).next();
if let (Some(bangumi_home_page_url), Some(bangumi_title)) = (
title_and_href_elem.and_then(|elem| elem.attr("href")),
title_and_href_elem.and_then(|elem| elem.attr("title")),
) {
let origin_poster_src = poster_elem.and_then(|ele| {
ele.attr("data-src")
.and_then(|data_src| extract_image_src_from_str(data_src, &mikan_base_url))
.or_else(|| {
ele.attr("style").and_then(|style| {
extract_background_image_src_from_style_attr(style, &mikan_base_url)
})
})
});
let bangumi_home_page_url = my_bangumi_page_url.join(bangumi_home_page_url)?;
if let Some(MikanBangumiHomepage {
ref mikan_bangumi_id,
..
}) = parse_mikan_bangumi_id_from_homepage(&bangumi_home_page_url)
{
if let Some(origin_poster_src) = origin_poster_src.as_ref() {
tracing::trace!(
origin_poster_src = origin_poster_src.as_str(),
bangumi_title,
mikan_bangumi_id,
"bangumi info extracted"
);
} else {
tracing::warn!(
bangumi_title,
mikan_bangumi_id,
"bangumi info extracted, but failed to extract poster_src"
);
}
let bangumi_expand_info_url =
build_mikan_bangumi_expand_info_url(mikan_base_url.clone(), mikan_bangumi_id)?;
let bangumi_expand_info_content =
fetch_html(http_client, bangumi_expand_info_url).await?;
let bangumi_expand_info_fragment =
Html::parse_fragment(&bangumi_expand_info_content);
for fansub_info in bangumi_expand_info_fragment.select(fansub_container_selector) {
if let (Some(fansub_name), Some(mikan_fansub_id)) = (
fansub_info
.select(fansub_title_selector)
.next()
.and_then(|ele| ele.attr("title")),
fansub_info
.select(fansub_id_selector)
.next()
.and_then(|ele| ele.attr("data-subtitlegroupid")),
) {
tracing::trace!(
fansub_name = &fansub_name,
mikan_fansub_id,
"subscribed fansub extracted"
);
bangumi_list.push(MikanBangumiMeta {
homepage: build_mikan_bangumi_homepage(
mikan_base_url.clone(),
mikan_bangumi_id.as_str(),
Some(mikan_fansub_id),
)?,
bangumi_title: bangumi_title.to_string(),
mikan_bangumi_id: mikan_bangumi_id.to_string(),
mikan_fansub_id: Some(mikan_fansub_id.to_string()),
fansub: Some(fansub_name.to_string()),
origin_poster_src: origin_poster_src.clone(),
})
}
}
}
}
}
Ok(bangumi_list)
}
#[cfg(test)]
mod test {
#![allow(unused_variables)]
use color_eyre::eyre;
use rstest::{fixture, rstest};
use tracing::Level;
use url::Url;
use zune_image::{codecs::ImageFormat, image::Image};
use super::*;
use crate::{
extract::mikan::web_extract::extract_mikan_bangumis_meta_from_my_bangumi_page,
test_utils::{mikan::build_testing_mikan_client, tracing::init_testing_tracing},
};
#[fixture]
fn before_each() {
init_testing_tracing(Level::INFO);
}
#[rstest]
#[tokio::test]
async fn test_extract_mikan_poster_from_src(before_each: ()) -> eyre::Result<()> {
let mut mikan_server = mockito::Server::new_async().await;
let mikan_base_url = Url::parse(&mikan_server.url())?;
let mikan_client = build_testing_mikan_client(mikan_base_url.clone())?;
let bangumi_poster_url = mikan_base_url.join("/images/Bangumi/202309/5ce9fed1.jpg")?;
let bangumi_poster_mock = mikan_server
.mock("GET", bangumi_poster_url.path())
.with_body_from_file("tests/resources/mikan/Bangumi-202309-5ce9fed1.jpg")
.create_async()
.await;
let bgm_poster =
extract_mikan_poster_meta_from_src(Some(&mikan_client), bangumi_poster_url).await?;
bangumi_poster_mock.expect(1);
let u8_data = bgm_poster.poster_data.expect("should have poster data");
let image = Image::read(u8_data.to_vec(), Default::default());
assert!(
image.is_ok_and(|img| img
.metadata()
.get_image_format()
.is_some_and(|fmt| matches!(fmt, ImageFormat::JPEG))),
"should start with valid jpeg data magic number"
);
Ok(())
}
#[rstest]
#[tokio::test]
async fn test_extract_mikan_episode(before_each: ()) -> eyre::Result<()> {
let mut mikan_server = mockito::Server::new_async().await;
let mikan_base_url = Url::parse(&mikan_server.url())?;
let mikan_client = build_testing_mikan_client(mikan_base_url.clone())?;
let episode_homepage_url =
mikan_base_url.join("/Home/Episode/475184dce83ea2b82902592a5ac3343f6d54b36a")?;
let episode_homepage_mock = mikan_server
.mock("GET", episode_homepage_url.path())
.with_body_from_file(
"tests/resources/mikan/Episode-475184dce83ea2b82902592a5ac3343f6d54b36a.htm",
)
.create_async()
.await;
let ep_meta = extract_mikan_episode_meta_from_episode_homepage(
Some(&mikan_client),
episode_homepage_url.clone(),
)
.await?;
assert_eq!(ep_meta.homepage, episode_homepage_url);
assert_eq!(ep_meta.bangumi_title, "葬送的芙莉莲");
assert_eq!(
ep_meta
.origin_poster_src
.as_ref()
.map(|s| s.path().to_string()),
Some(String::from("/images/Bangumi/202309/5ce9fed1.jpg"))
);
assert_eq!(ep_meta.fansub, "LoliHouse");
assert_eq!(ep_meta.mikan_fansub_id, "370");
assert_eq!(ep_meta.mikan_bangumi_id, "3141");
Ok(())
}
#[rstest]
#[tokio::test]
async fn test_extract_mikan_bangumi_meta_from_bangumi_homepage(
before_each: (),
) -> eyre::Result<()> {
let mut mikan_server = mockito::Server::new_async().await;
let mikan_base_url = Url::parse(&mikan_server.url())?;
let mikan_client = build_testing_mikan_client(mikan_base_url.clone())?;
let bangumi_homepage_url = mikan_base_url.join("/Home/Bangumi/3416#370")?;
let bangumi_homepage_mock = mikan_server
.mock("GET", bangumi_homepage_url.path())
.with_body_from_file("tests/resources/mikan/Bangumi-3416-370.htm")
.create_async()
.await;
let bgm_meta = extract_mikan_bangumi_meta_from_bangumi_homepage(
Some(&mikan_client),
bangumi_homepage_url.clone(),
)
.await?;
assert_eq!(bgm_meta.homepage, bangumi_homepage_url);
assert_eq!(bgm_meta.bangumi_title, "叹气的亡灵想隐退");
assert_eq!(
bgm_meta
.origin_poster_src
.as_ref()
.map(|s| s.path().to_string()),
Some(String::from("/images/Bangumi/202410/480ef127.jpg"))
);
assert_eq!(bgm_meta.fansub, Some(String::from("LoliHouse")));
assert_eq!(bgm_meta.mikan_fansub_id, Some(String::from("370")));
assert_eq!(bgm_meta.mikan_bangumi_id, "3416");
Ok(())
}
#[rstest]
#[tokio::test]
async fn test_extract_mikan_bangumis_meta_from_my_bangumi_page(
before_each: (),
) -> eyre::Result<()> {
let mut mikan_server = mockito::Server::new_async().await;
let mikan_base_url = Url::parse(&mikan_server.url())?;
let mikan_client = build_testing_mikan_client(mikan_base_url.clone())?;
let my_bangumi_page_url = mikan_base_url.join("/Home/MyBangumi")?;
let mock_my_bangumi = mikan_server
.mock("GET", my_bangumi_page_url.path())
.with_body_from_file("tests/resources/mikan/MyBangumi.htm")
.create_async()
.await;
let mock_expand_bangumi = mikan_server
.mock("GET", "/ExpandBangumi")
.match_query(mockito::Matcher::Any)
.with_body_from_file("tests/resources/mikan/ExpandBangumi.htm")
.create_async()
.await;
let bangumi_metas = extract_mikan_bangumis_meta_from_my_bangumi_page(
Some(&mikan_client),
my_bangumi_page_url,
)
.await?;
assert!(!bangumi_metas.is_empty());
assert!(bangumi_metas[0].origin_poster_src.is_some());
mock_my_bangumi.expect(1);
mock_expand_bangumi.expect(bangumi_metas.len());
Ok(())
}
}
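
The id parsers above are pure URL manipulation and easy to exercise in isolation; a standalone sketch, with the type and function repeated from the file above:

use url::Url;

#[derive(Clone, Debug, PartialEq)]
pub struct MikanEpisodeHomepage {
    pub mikan_episode_id: String,
}

pub fn parse_mikan_episode_id_from_homepage(url: &Url) -> Option<MikanEpisodeHomepage> {
    if url.path().starts_with("/Home/Episode/") {
        let mikan_episode_id = url.path().replace("/Home/Episode/", "");
        Some(MikanEpisodeHomepage { mikan_episode_id })
    } else {
        None
    }
}

fn main() {
    let url = Url::parse("https://mikanani.me/Home/Episode/475184dce83ea2b82902592a5ac3343f6d54b36a")
        .unwrap();
    // The path prefix is stripped, leaving the opaque episode id.
    let page = parse_mikan_episode_id_from_homepage(&url).unwrap();
    assert_eq!(page.mikan_episode_id, "475184dce83ea2b82902592a5ac3343f6d54b36a");
}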

View File

@@ -1,595 +0,0 @@
use std::ops::Deref;
use bytes::Bytes;
use color_eyre::eyre::{self, ContextCompat};
use html_escape::decode_html_entities;
use itertools::Itertools;
use lazy_static::lazy_static;
use lightningcss::{properties::Property, values::image::Image as CSSImage};
use loco_rs::app::AppContext;
use regex::Regex;
use reqwest::IntoUrl;
use scraper::Html;
use url::Url;
use super::{
AppMikanClient, MIKAN_BUCKET_KEY, MikanBangumiRssLink, parse_mikan_bangumi_id_from_rss_link,
};
use crate::{
app::AppContextExt,
dal::DalContentCategory,
extract::html::parse_style_attr,
fetch::{html::fetch_html, image::fetch_image},
};
#[derive(Clone, Debug, PartialEq)]
pub struct MikanEpisodeMeta {
pub homepage: Url,
pub origin_poster_src: Option<Url>,
pub bangumi_title: String,
pub episode_title: String,
pub fansub: String,
pub mikan_bangumi_id: String,
pub mikan_fansub_id: String,
pub mikan_episode_id: String,
}
#[derive(Clone, Debug, PartialEq)]
pub struct MikanBangumiMeta {
pub homepage: Url,
pub origin_poster_src: Option<Url>,
pub bangumi_title: String,
pub mikan_bangumi_id: String,
pub mikan_fansub_id: Option<String>,
pub fansub: Option<String>,
}
#[derive(Clone, Debug, PartialEq)]
pub struct MikanBangumiPosterMeta {
pub origin_poster_src: Url,
pub poster_data: Option<Bytes>,
pub poster_src: Option<String>,
}
#[derive(Clone, Debug, PartialEq)]
pub struct MikanEpisodeHomepage {
pub mikan_episode_id: String,
}
#[derive(Clone, Debug, PartialEq)]
pub struct MikanBangumiHomepage {
pub mikan_bangumi_id: String,
pub mikan_fansub_id: Option<String>,
}
lazy_static! {
static ref MIKAN_TITLE_SEASON: Regex = Regex::new("第.*季").unwrap();
}
pub fn build_mikan_bangumi_homepage(
mikan_base_url: impl IntoUrl,
mikan_bangumi_id: &str,
mikan_fansub_id: Option<&str>,
) -> eyre::Result<Url> {
let mut url = mikan_base_url.into_url()?;
url.set_path(&format!("/Home/Bangumi/{mikan_bangumi_id}"));
url.set_fragment(mikan_fansub_id);
Ok(url)
}
pub fn build_mikan_episode_homepage(
mikan_base_url: impl IntoUrl,
mikan_episode_id: &str,
) -> eyre::Result<Url> {
let mut url = mikan_base_url.into_url()?;
url.set_path(&format!("/Home/Episode/{mikan_episode_id}"));
Ok(url)
}
pub fn build_mikan_bangumi_expand_info_url(
mikan_base_url: impl IntoUrl,
mikan_bangumi_id: &str,
) -> eyre::Result<Url> {
let mut url = mikan_base_url.into_url()?;
url.set_path("/ExpandBangumi");
url.query_pairs_mut()
.append_pair("bangumiId", mikan_bangumi_id)
.append_pair("showSubscribed", "true");
Ok(url)
}
pub fn parse_mikan_bangumi_id_from_homepage(url: &Url) -> Option<MikanBangumiHomepage> {
if url.path().starts_with("/Home/Bangumi/") {
let mikan_bangumi_id = url.path().replace("/Home/Bangumi/", "");
Some(MikanBangumiHomepage {
mikan_bangumi_id,
mikan_fansub_id: url.fragment().map(String::from),
})
} else {
None
}
}
pub fn parse_mikan_episode_id_from_homepage(url: &Url) -> Option<MikanEpisodeHomepage> {
if url.path().starts_with("/Home/Episode/") {
let mikan_episode_id = url.path().replace("/Home/Episode/", "");
Some(MikanEpisodeHomepage { mikan_episode_id })
} else {
None
}
}
pub async fn parse_mikan_bangumi_poster_from_origin_poster_src(
client: Option<&AppMikanClient>,
origin_poster_src_url: Url,
) -> eyre::Result<MikanBangumiPosterMeta> {
let http_client = client.map(|s| s.deref());
let poster_data = fetch_image(http_client, origin_poster_src_url.clone()).await?;
Ok(MikanBangumiPosterMeta {
origin_poster_src: origin_poster_src_url,
poster_data: Some(poster_data),
poster_src: None,
})
}
pub async fn parse_mikan_bangumi_poster_from_origin_poster_src_with_cache(
ctx: &AppContext,
origin_poster_src_url: Url,
subscriber_id: i32,
) -> eyre::Result<MikanBangumiPosterMeta> {
let dal_client = ctx.get_dal_client();
let mikan_client = ctx.get_mikan_client();
if let Some(poster_src) = dal_client
.exists_object(
DalContentCategory::Image,
subscriber_id,
Some(MIKAN_BUCKET_KEY),
&origin_poster_src_url.path().replace("/images/Bangumi/", ""),
)
.await?
{
return Ok(MikanBangumiPosterMeta {
origin_poster_src: origin_poster_src_url,
poster_data: None,
poster_src: Some(poster_src.to_string()),
});
}
let poster_data =
fetch_image(Some(mikan_client.deref()), origin_poster_src_url.clone()).await?;
let poster_str = dal_client
.store_object(
DalContentCategory::Image,
subscriber_id,
Some(MIKAN_BUCKET_KEY),
&origin_poster_src_url.path().replace("/images/Bangumi/", ""),
poster_data.clone(),
)
.await?;
Ok(MikanBangumiPosterMeta {
origin_poster_src: origin_poster_src_url,
poster_data: Some(poster_data),
poster_src: Some(poster_str.to_string()),
})
}
pub fn parse_mikan_origin_poster_src_from_style_attr(
mikan_base_url: impl IntoUrl,
style_attr: &str,
) -> Option<Url> {
let base_url = mikan_base_url.into_url().ok()?;
parse_style_attr(style_attr)
.and_then(|style| {
style.iter().find_map(|(prop, _)| {
match prop {
Property::BackgroundImage(images) => {
for img in images {
if let CSSImage::Url(path) = img {
if let Ok(url) = base_url.join(path.url.trim()) {
return Some(url);
}
}
}
}
Property::Background(backgrounds) => {
for bg in backgrounds {
if let CSSImage::Url(path) = &bg.image {
if let Ok(url) = base_url.join(path.url.trim()) {
return Some(url);
}
}
}
}
_ => {}
}
None
})
})
.map(|mut poster_str| {
poster_str.set_query(None);
poster_str.set_fragment(None);
poster_str
})
}
pub async fn parse_mikan_bangumi_meta_from_mikan_homepage(
client: Option<&AppMikanClient>,
mikan_bangumi_homepage_url: Url,
) -> eyre::Result<MikanBangumiMeta> {
let http_client = client.map(|s| s.deref());
let mikan_base_url = mikan_bangumi_homepage_url.origin().unicode_serialization();
let content = fetch_html(http_client, mikan_bangumi_homepage_url.as_str()).await?;
let html = Html::parse_document(&content);
let bangumi_fansubs = html
.select(&scraper::Selector::parse(".subgroup-text").unwrap())
.filter_map(|el| {
if let (Some(fansub_id), Some(fansub_name)) = (
el.value()
.attr("id")
.map(|s| decode_html_entities(s).trim().to_string()),
el.select(&scraper::Selector::parse("a:nth-child(1)").unwrap())
.next()
.map(|child| {
let mut s = String::from(
child
.prev_sibling()
.and_then(|t| t.value().as_text())
.map(|s| s.trim())
.unwrap_or_default(),
);
s.extend(child.text());
decode_html_entities(&s).trim().to_string()
}),
) {
Some((fansub_id, fansub_name))
} else {
None
}
})
.collect_vec();
let fansub_info = mikan_bangumi_homepage_url.fragment().and_then(|b| {
bangumi_fansubs
.iter()
.find_map(|(id, name)| if id == b { Some((id, name)) } else { None })
});
let bangumi_title = html
.select(&scraper::Selector::parse(".bangumi-title").unwrap())
.next()
.map(|el| {
decode_html_entities(&el.text().collect::<String>())
.trim()
.to_string()
})
.and_then(|title| if title.is_empty() { None } else { Some(title) })
.wrap_err_with(|| {
// todo: error handler
format!(
"Missing mikan bangumi official title for {}",
mikan_bangumi_homepage_url
)
})?;
let MikanBangumiRssLink {
mikan_bangumi_id, ..
} = html
.select(&scraper::Selector::parse(".bangumi-title > .mikan-rss").unwrap())
.next()
.and_then(|el| el.value().attr("href"))
.as_ref()
.and_then(|s| mikan_bangumi_homepage_url.join(s).ok())
.and_then(|rss_link_url| parse_mikan_bangumi_id_from_rss_link(&rss_link_url))
.wrap_err_with(|| {
// todo: error handler
format!(
"Missing mikan bangumi rss link or error format for {}",
mikan_bangumi_homepage_url
)
})?;
let origin_poster_src = html
.select(&scraper::Selector::parse(".bangumi-poster").unwrap())
.next()
.and_then(|el| el.value().attr("style"))
.and_then(|style_attr| {
parse_mikan_origin_poster_src_from_style_attr(&mikan_base_url, style_attr)
});
Ok(MikanBangumiMeta {
homepage: mikan_bangumi_homepage_url,
bangumi_title,
origin_poster_src,
mikan_bangumi_id,
fansub: fansub_info.map(|s| s.1.to_string()),
mikan_fansub_id: fansub_info.map(|s| s.0.to_string()),
})
}
pub async fn parse_mikan_episode_meta_from_mikan_homepage(
client: Option<&AppMikanClient>,
mikan_episode_homepage_url: Url,
) -> eyre::Result<MikanEpisodeMeta> {
let http_client = client.map(|s| s.deref());
let mikan_base_url = mikan_episode_homepage_url.origin().unicode_serialization();
let content = fetch_html(http_client, mikan_episode_homepage_url.as_str()).await?;
let html = Html::parse_document(&content);
let bangumi_title = html
.select(&scraper::Selector::parse(".bangumi-title").unwrap())
.next()
.map(|el| {
decode_html_entities(&el.text().collect::<String>())
.trim()
.to_string()
})
.and_then(|title| if title.is_empty() { None } else { Some(title) })
.wrap_err_with(|| {
// todo: error handler
format!(
"Missing mikan bangumi official title for {}",
mikan_episode_homepage_url
)
})?;
let episode_title = html
.select(&scraper::Selector::parse("title").unwrap())
.next()
.map(|el| {
decode_html_entities(&el.text().collect::<String>())
.replace(" - Mikan Project", "")
.trim()
.to_string()
})
.and_then(|title| if title.is_empty() { None } else { Some(title) })
.wrap_err_with(|| {
// todo: error handler
format!(
"Missing mikan episode official title for {}",
mikan_episode_homepage_url
)
})?;
let (mikan_bangumi_id, mikan_fansub_id) = html
.select(&scraper::Selector::parse(".bangumi-title > .mikan-rss").unwrap())
.next()
.and_then(|el| el.value().attr("href"))
.as_ref()
.and_then(|s| mikan_episode_homepage_url.join(s).ok())
.and_then(|rss_link_url| parse_mikan_bangumi_id_from_rss_link(&rss_link_url))
.and_then(
|MikanBangumiRssLink {
mikan_bangumi_id,
mikan_fansub_id,
..
}| {
mikan_fansub_id.map(|mikan_fansub_id| (mikan_bangumi_id, mikan_fansub_id))
},
)
.wrap_err_with(|| {
// todo: error handler
format!(
"Missing mikan bangumi rss link or error format for {}",
mikan_episode_homepage_url
)
})?;
let fansub = html
.select(&scraper::Selector::parse(".bangumi-info>.magnet-link-wrap").unwrap())
.next()
.map(|el| {
decode_html_entities(&el.text().collect::<String>())
.trim()
.to_string()
})
.wrap_err_with(|| {
// todo: error handler
format!(
"Missing mikan bangumi fansub name for {}",
mikan_episode_homepage_url
)
})?;
let origin_poster_src = html
.select(&scraper::Selector::parse(".bangumi-poster").unwrap())
.next()
.and_then(|el| el.value().attr("style"))
.and_then(|s| parse_mikan_origin_poster_src_from_style_attr(mikan_base_url, s));
let MikanEpisodeHomepage {
mikan_episode_id, ..
} = parse_mikan_episode_id_from_homepage(&mikan_episode_homepage_url).wrap_err_with(|| {
format!(
"Failed to extract mikan_episode_id from {}",
&mikan_episode_homepage_url
)
})?;
Ok(MikanEpisodeMeta {
mikan_bangumi_id,
mikan_fansub_id,
bangumi_title,
episode_title,
homepage: mikan_episode_homepage_url,
origin_poster_src,
fansub,
mikan_episode_id,
})
}
/**
* @logined-required
*/
pub async fn parse_mikan_bangumis_meta_from_my_bangumi_page(
client: Option<&AppMikanClient>,
my_bangumi_page_url: Url,
) -> eyre::Result<Vec<MikanBangumiMeta>> {
let http_client = client.map(|c| c.deref());
let mikan_base_url = my_bangumi_page_url.origin().unicode_serialization();
let content = fetch_html(http_client, my_bangumi_page_url.clone()).await?;
let html = Html::parse_document(&content);
let mut bangumi_list = vec![];
for bangumi_elem in
html.select(&scraper::Selector::parse(".sk-bangumi .an-info a.an-text").unwrap())
{
if let (Some(bangumi_home_page_url), Some(bangumi_title)) =
(bangumi_elem.attr("href"), bangumi_elem.attr("title"))
{
let origin_poster_src = bangumi_elem
.prev_sibling()
.and_then(|ele| ele.value().as_element())
.and_then(|ele| ele.attr("style"))
.and_then(|style_attr| {
parse_mikan_origin_poster_src_from_style_attr(
mikan_base_url.clone(),
style_attr,
)
});
let bangumi_home_page_url = my_bangumi_page_url.join(bangumi_home_page_url)?;
if let Some(MikanBangumiHomepage {
ref mikan_bangumi_id,
..
}) = parse_mikan_bangumi_id_from_homepage(&bangumi_home_page_url)
{
let bangumi_expand_info_url =
build_mikan_bangumi_expand_info_url(mikan_base_url.clone(), mikan_bangumi_id)?;
let bangumi_expand_info_content =
fetch_html(http_client, bangumi_expand_info_url).await?;
let bangumi_expand_info_fragment =
Html::parse_fragment(&bangumi_expand_info_content);
for fansub_info in bangumi_expand_info_fragment.select(
&scraper::Selector::parse("js-expand_bangumi-subgroup.js-subscribed").unwrap(),
) {
if let (Some(fansub_name), Some(mikan_fansub_id)) = (
fansub_info
.select(&scraper::Selector::parse(".tag-res-name[title]").unwrap())
.next()
.and_then(|ele| ele.attr("title")),
fansub_info
.select(
&scraper::Selector::parse(
".active[data-subtitlegroupid][data-bangumiid]",
)
.unwrap(),
)
.next()
.and_then(|ele| ele.attr("data-subtitlegroupid")),
) {
bangumi_list.push(MikanBangumiMeta {
homepage: build_mikan_bangumi_homepage(
mikan_base_url.clone(),
mikan_bangumi_id.as_str(),
Some(mikan_fansub_id),
)?,
bangumi_title: bangumi_title.to_string(),
mikan_bangumi_id: mikan_bangumi_id.to_string(),
mikan_fansub_id: Some(mikan_fansub_id.to_string()),
fansub: Some(fansub_name.to_string()),
origin_poster_src: origin_poster_src.clone(),
})
}
}
}
}
}
Ok(bangumi_list)
}
#[cfg(test)]
mod test {
use std::assert_matches::assert_matches;
use color_eyre::eyre;
use url::Url;
use zune_image::{codecs::ImageFormat, image::Image};
use super::{
parse_mikan_bangumi_meta_from_mikan_homepage,
parse_mikan_bangumi_poster_from_origin_poster_src,
parse_mikan_episode_meta_from_mikan_homepage,
};
#[tokio::test]
async fn test_parse_mikan_episode() {
let test_fn = async || -> eyre::Result<()> {
let url_str =
"https://mikanani.me/Home/Episode/475184dce83ea2b82902592a5ac3343f6d54b36a";
let url = Url::parse(url_str)?;
let ep_meta = parse_mikan_episode_meta_from_mikan_homepage(None, url.clone()).await?;
assert_eq!(ep_meta.homepage, url);
assert_eq!(ep_meta.bangumi_title, "葬送的芙莉莲");
assert_eq!(
ep_meta.origin_poster_src,
Some(Url::parse(
"https://mikanani.me/images/Bangumi/202309/5ce9fed1.jpg"
)?)
);
assert_eq!(ep_meta.fansub, "LoliHouse");
assert_eq!(ep_meta.mikan_fansub_id, "370");
assert_eq!(ep_meta.mikan_bangumi_id, "3141");
assert_matches!(ep_meta.origin_poster_src, Some(..));
let bgm_poster = parse_mikan_bangumi_poster_from_origin_poster_src(
None,
ep_meta.origin_poster_src.unwrap(),
)
.await?;
let u8_data = bgm_poster.poster_data.expect("should have poster data");
let image = Image::read(u8_data.to_vec(), Default::default());
assert!(
image.is_ok_and(|img| img
.metadata()
.get_image_format()
.is_some_and(|fmt| matches!(fmt, ImageFormat::JPEG))),
"should start with valid jpeg data magic number"
);
Ok(())
};
test_fn().await.expect("test parse mikan failed");
}
#[tokio::test]
async fn test_parse_mikan_bangumi() {
let test_fn = async || -> eyre::Result<()> {
let url_str = "https://mikanani.me/Home/Bangumi/3416#370";
let url = Url::parse(url_str)?;
let bgm_meta = parse_mikan_bangumi_meta_from_mikan_homepage(None, url.clone()).await?;
assert_eq!(bgm_meta.homepage, url);
assert_eq!(bgm_meta.bangumi_title, "叹气的亡灵想隐退");
assert_eq!(
bgm_meta.origin_poster_src,
Some(Url::parse(
"https://mikanani.me/images/Bangumi/202410/480ef127.jpg"
)?)
);
assert_eq!(bgm_meta.fansub, Some(String::from("LoliHouse")));
assert_eq!(bgm_meta.mikan_fansub_id, Some(String::from("370")));
assert_eq!(bgm_meta.mikan_bangumi_id, "3416");
assert_eq!(
bgm_meta.homepage.as_str(),
"https://mikanani.me/Home/Bangumi/3416#370"
);
Ok(())
};
test_fn().await.expect("test parse mikan failed");
}
}

View File

@@ -2,6 +2,7 @@ pub mod defs;
pub mod errors;
pub mod html;
pub mod http;
pub mod media;
pub mod mikan;
pub mod rawname;
pub mod torrent;

View File

@@ -1,4 +1,4 @@
use std::{ops::Deref, sync::Arc, time::Duration};
use std::{fmt::Debug, ops::Deref, sync::Arc, time::Duration};
use async_trait::async_trait;
use axum::http::{self, Extensions};
@@ -11,7 +11,7 @@ use reqwest::{ClientBuilder, Request, Response};
use reqwest_middleware::{
ClientBuilder as ClientWithMiddlewareBuilder, ClientWithMiddleware, Next,
};
use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware};
use reqwest_retry::{RetryTransientMiddleware, policies::ExponentialBackoff};
use reqwest_tracing::TracingMiddleware;
use serde::{Deserialize, Serialize};
use serde_with::serde_as;
@@ -101,6 +101,14 @@ pub struct HttpClient {
pub config: HttpClientConfig,
}
impl Debug for HttpClient {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("HttpClient")
.field("config", &self.config)
.finish()
}
}
impl From<HttpClient> for ClientWithMiddleware {
fn from(val: HttpClient) -> Self {
val.client

View File

@@ -1,4 +1,9 @@
#![feature(duration_constructors, assert_matches, unboxed_closures)]
#![feature(
duration_constructors,
assert_matches,
unboxed_closures,
impl_trait_in_bindings
)]
pub mod app;
pub mod auth;

View File

@@ -201,7 +201,7 @@ impl ActiveModel {
.ok()
.unwrap_or_default();
let homepage = build_mikan_episode_homepage(
ctx.get_mikan_client().base_url(),
ctx.get_mikan_client().base_url().clone(),
&item.mikan_episode_id,
)?;

View File

@@ -12,11 +12,11 @@ use crate::{
extract::{
mikan::{
build_mikan_bangumi_homepage, build_mikan_bangumi_rss_link,
parse_mikan_bangumi_meta_from_mikan_homepage,
parse_mikan_episode_meta_from_mikan_homepage, parse_mikan_rss_channel_from_rss_link,
web_parser::{
MikanBangumiPosterMeta,
parse_mikan_bangumi_poster_from_origin_poster_src_with_cache,
extract_mikan_bangumi_meta_from_bangumi_homepage,
extract_mikan_episode_meta_from_episode_homepage,
parse_mikan_rss_channel_from_rss_link,
web_extract::{
MikanBangumiPosterMeta, extract_mikan_bangumi_poster_meta_from_src_with_cache,
},
},
rawname::extract_season_from_title_body,
@@ -256,7 +256,7 @@ impl Model {
let mut new_metas = vec![];
for new_rss_item in new_rss_items.iter() {
new_metas.push(
parse_mikan_episode_meta_from_mikan_homepage(
extract_mikan_episode_meta_from_episode_homepage(
Some(mikan_client),
new_rss_item.homepage.clone(),
)
@@ -272,12 +272,12 @@ impl Model {
{
let mikan_base_url = ctx.get_mikan_client().base_url();
let bgm_homepage = build_mikan_bangumi_homepage(
mikan_base_url,
mikan_base_url.clone(),
&mikan_bangumi_id,
Some(&mikan_fansub_id),
)?;
let bgm_rss_link = build_mikan_bangumi_rss_link(
mikan_base_url,
mikan_base_url.clone(),
&mikan_bangumi_id,
Some(&mikan_fansub_id),
)?;
@@ -289,7 +289,7 @@ impl Model {
mikan_bangumi_id.to_string(),
mikan_fansub_id.to_string(),
async |am| -> color_eyre::eyre::Result<()> {
let bgm_meta = parse_mikan_bangumi_meta_from_mikan_homepage(
let bgm_meta = extract_mikan_bangumi_meta_from_bangumi_homepage(
Some(mikan_client),
bgm_homepage.clone(),
)
@@ -306,9 +306,9 @@ impl Model {
am.fansub = ActiveValue::Set(bgm_meta.fansub);
if let Some(origin_poster_src) = bgm_meta.origin_poster_src {
if let MikanBangumiPosterMeta {
poster_src: Some(poster_src),
..
} = parse_mikan_bangumi_poster_from_origin_poster_src_with_cache(
poster_src: Some(poster_src),
..
} = extract_mikan_bangumi_poster_meta_from_src_with_cache(
ctx,
origin_poster_src,
self.subscriber_id,
@@ -318,7 +318,7 @@ impl Model {
am.poster_link = ActiveValue::Set(Some(poster_src))
}
}
Ok(())
Ok(())
},
)
.await?,

View File

@@ -0,0 +1,17 @@
use color_eyre::eyre;
use reqwest::IntoUrl;
use crate::{
extract::mikan::{AppMikanClient, AppMikanConfig},
fetch::HttpClientConfig,
};
pub fn build_testing_mikan_client(base_mikan_url: impl IntoUrl) -> eyre::Result<AppMikanClient> {
let mikan_client = AppMikanClient::new(AppMikanConfig {
http_client: HttpClientConfig {
..Default::default()
},
base_url: base_mikan_url.into_url()?,
})?;
Ok(mikan_client)
}

View File

@@ -1,2 +1,4 @@
pub mod mikan;
#[cfg(feature = "testcontainers")]
pub mod testcontainers;
pub mod tracing;

View File

@@ -0,0 +1,12 @@
use tracing::Level;
use tracing_subscriber::EnvFilter;
pub fn init_testing_tracing(level: Level) {
let crate_name = env!("CARGO_PKG_NAME");
let filter = EnvFilter::new(format!(
"{}[]={}",
crate_name,
level.as_str().to_lowercase()
));
tracing_subscriber::fmt().with_env_filter(filter).init();
}
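
The directive built here follows tracing's `target[span]=level` filter syntax, so `{crate}[]=info` enables every span and event emitted from inside the test crate at `info` or above. A sketch of what the call expands to; the crate name "recorder" is hypothetical, and the `env-filter` feature of tracing-subscriber is assumed:

use tracing_subscriber::EnvFilter;

fn main() {
    // Equivalent of init_testing_tracing(Level::INFO) in a crate hypothetically
    // named "recorder": all spans/events under that target at `info` and above.
    let filter = EnvFilter::new("recorder[]=info");
    tracing_subscriber::fmt().with_env_filter(filter).init();
    tracing::info!("tracing initialized for tests");
}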