feature: rewrite season subscription extractor

This commit is contained in:
2025-05-02 02:23:23 +08:00
parent 4301f1dbab
commit dbded94324
51 changed files with 8181 additions and 6035 deletions

View File

@@ -2,7 +2,10 @@ use url::Url;
/// Normalize an image `src` attribute value into an absolute URL.
///
/// Joins `image_src` against `base_url`, drops any fragment, and strips every
/// query parameter except `webp` (kept because it selects the served image
/// format). Returns `None` when the join fails.
pub fn extract_image_src_from_str(image_src: &str, base_url: &Url) -> Option<Url> {
    let mut image_url = base_url.join(image_src).ok()?;
    // Capture the `webp` parameter BEFORE clearing the query: the previous
    // code called `set_query(None)` first, so the `webp` lookup below could
    // never match and the preservation branch was dead.
    let webp_value = image_url
        .query_pairs()
        .find(|(key, _)| key == "webp")
        .map(|(_, value)| value.into_owned());
    image_url.set_fragment(None);
    match webp_value {
        Some(value) => image_url.set_query(Some(&format!("webp={value}"))),
        None => image_url.set_query(None),
    }
    Some(image_url)
}

View File

@@ -1,60 +1,204 @@
use std::{fmt::Debug, ops::Deref};
use std::{fmt::Debug, ops::Deref, sync::Arc};
use fetch::{HttpClient, HttpClientTrait, client::HttpClientCookiesAuth};
use fetch::{HttpClient, HttpClientTrait};
use maplit::hashmap;
use sea_orm::DbErr;
use secrecy::SecretBox;
use serde::{Deserialize, Serialize};
use url::Url;
use util::OptDynErr;
use super::MikanConfig;
use crate::errors::RecorderError;
use super::{MikanConfig, constants::MIKAN_ACCOUNT_MANAGE_PAGE_PATH};
use crate::{
app::AppContextTrait,
crypto::UserPassCredential,
errors::{RecorderError, RecorderResult},
extract::mikan::constants::{MIKAN_LOGIN_PAGE_PATH, MIKAN_LOGIN_PAGE_SEARCH},
models::credential_3rd::{self, Credential3rdType},
};
#[derive(Default, Clone, Deserialize, Serialize)]
pub struct MikanAuthSecrecy {
pub cookie: String,
pub user_agent: Option<String>,
pub struct MikanCredentialForm {
pub password: String,
pub username: String,
pub user_agent: String,
}
impl Debug for MikanAuthSecrecy {
pub type MikanAuthSecrecy = SecretBox<MikanCredentialForm>;
impl Debug for MikanCredentialForm {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("MikanAuthSecrecy")
.field("cookie", &String::from("[secrecy]"))
f.debug_struct("MikanCredentialForm")
.field("username", &String::from("[secrecy]"))
.field("password", &String::from("[secrecy]"))
.field("user_agent", &String::from("[secrecy]"))
.finish()
}
}
impl MikanAuthSecrecy {
pub fn into_cookie_auth(self, url: &Url) -> Result<HttpClientCookiesAuth, RecorderError> {
HttpClientCookiesAuth::from_cookies(&self.cookie, url, self.user_agent)
.map_err(RecorderError::from)
}
}
#[derive(Debug)]
/// HTTP client wrapper for mikan, carrying the resolved base/origin URLs and
/// an optional user/pass credential used by `login`.
pub struct MikanClient {
// Underlying HTTP client (cookie store, UA, etc. live here).
http_client: HttpClient,
// Configured base URL; page paths are joined onto this.
base_url: Url,
// Origin (scheme + host) derived from `base_url` in `from_config`.
origin_url: Url,
// Set by `fork_with_credential`; `None` means anonymous access.
userpass_credential: Option<UserPassCredential>,
}
impl MikanClient {
/// Build a client from configuration.
///
/// The origin URL is derived from the configured base URL; the client starts
/// with no credential attached (use `fork_with_credential` to add one).
pub async fn from_config(config: MikanConfig) -> Result<Self, RecorderError> {
    let http_client = HttpClient::from_config(config.http_client)?;
    let base_url = config.base_url;
    let origin = base_url.origin().unicode_serialization();
    let origin_url = Url::parse(&origin)?;
    Ok(Self {
        http_client,
        base_url,
        origin_url,
        userpass_credential: None,
    })
}
pub fn fork_with_auth(&self, secrecy: Option<MikanAuthSecrecy>) -> Result<Self, RecorderError> {
let mut fork = self.http_client.fork();
/// Probe the account-manage page to determine whether the session is
/// currently authenticated.
///
/// * `2xx` → logged in.
/// * Redirect whose `location` header points at the login page → logged out.
/// * Anything else → `Credential3rdError`.
pub async fn has_login(&self) -> RecorderResult<bool> {
    let manage_url = self.base_url.join(MIKAN_ACCOUNT_MANAGE_PAGE_PATH)?;
    let response = self.http_client.get(manage_url).send().await?;
    let status = response.status();
    if status.is_success() {
        return Ok(true);
    }
    let redirected_to_login = status.is_redirection()
        && response
            .headers()
            .get("location")
            .and_then(|location| location.to_str().ok())
            .is_some_and(|location_str| location_str.contains(MIKAN_LOGIN_PAGE_PATH));
    if redirected_to_login {
        Ok(false)
    } else {
        Err(RecorderError::Credential3rdError {
            message: format!("mikan account check has login failed, status = {}", status),
            source: None.into(),
        })
    }
}
if let Some(secrecy) = secrecy {
let cookie_auth = secrecy.into_cookie_auth(&self.base_url)?;
fork = fork.attach_secrecy(cookie_auth);
/// Perform a form-based login against mikan using the stored
/// user/pass credential.
///
/// Flow: GET the login page (seeds the antiforgery cookie into the shared
/// cookie store), extract that cookie, then POST the login form. Success is
/// judged by the server answering with a redirect carrying a `location`
/// header. Errors at every step are surfaced as `Credential3rdError`.
pub async fn login(&self) -> RecorderResult<()> {
// Require a credential to have been attached via `fork_with_credential`.
let userpass_credential =
self.userpass_credential
.as_ref()
.ok_or_else(|| RecorderError::Credential3rdError {
message: "mikan login failed, credential required".to_string(),
source: None.into(),
})?;
// Login page URL with its ReturnUrl search string appended.
// NOTE(review): MIKAN_LOGIN_PAGE_SEARCH starts with '?', which
// `set_query` will keep literally — confirm the resulting URL is intended.
let login_page_url = {
let mut u = self.base_url.join(MIKAN_LOGIN_PAGE_PATH)?;
u.set_query(Some(MIKAN_LOGIN_PAGE_SEARCH));
u
};
// access login page to get antiforgery cookie
self.http_client
.get(login_page_url.clone())
.send()
.await
.map_err(|error| RecorderError::Credential3rdError {
message: "failed to get mikan login page".to_string(),
source: OptDynErr::some_boxed(error),
})?;
// Pull the ASP.NET antiforgery cookie the GET above should have stored.
// The read lock is scoped to this block so it is released before the POST.
let antiforgery_cookie = {
let cookie_store_lock = self.http_client.cookie_store.clone().ok_or_else(|| {
RecorderError::Credential3rdError {
message: "failed to get cookie store".to_string(),
source: None.into(),
}
})?;
let cookie_store =
cookie_store_lock
.read()
.map_err(|_| RecorderError::Credential3rdError {
message: "failed to read cookie store".to_string(),
source: None.into(),
})?;
cookie_store
.matches(&login_page_url)
.iter()
.find(|cookie| cookie.name().starts_with(".AspNetCore.Antiforgery."))
.map(|cookie| cookie.value().to_string())
}
.ok_or_else(|| RecorderError::Credential3rdError {
message: "mikan login failed, failed to get antiforgery cookie".to_string(),
source: None.into(),
})?;
// NOTE(review): this posts the antiforgery *cookie* value as the
// __RequestVerificationToken form field; ASP.NET normally expects the
// token embedded in the page form — confirm mikan accepts the cookie value.
let login_post_form = hashmap! {
"__RequestVerificationToken".to_string() => antiforgery_cookie,
"UserName".to_string() => userpass_credential.username.clone(),
"Password".to_string() => userpass_credential.password.clone(),
"RememberMe".to_string() => "true".to_string(),
};
let login_post_res = self
.http_client
.post(login_page_url.clone())
.form(&login_post_form)
.send()
.await
.map_err(|err| RecorderError::Credential3rdError {
message: "mikan login failed".to_string(),
source: OptDynErr::some_boxed(err),
})?;
// A redirect with a location header signals a successful login; anything
// else (e.g. the login form re-rendered with a 200) is treated as failure.
if login_post_res.status().is_redirection()
&& login_post_res.headers().contains_key("location")
{
Ok(())
} else {
Err(RecorderError::Credential3rdError {
message: "mikan login failed, no redirecting".to_string(),
source: None.into(),
})
}
}
/// Fork this client, optionally attaching a stored third-party credential.
///
/// With `credential_id = None` the fork is anonymous. Otherwise the
/// credential is loaded from the database, validated to be a mikan
/// credential, and its cookies / user agent (when present) are attached to
/// the forked HTTP client.
///
/// # Errors
/// * `Credential3rdError` if the credential exists but is not of type Mikan.
/// * record-not-found if `credential_id` matches no row.
pub async fn fork_with_credential(
&self,
ctx: Arc<dyn AppContextTrait>,
credential_id: Option<i32>,
) -> RecorderResult<Self> {
let mut fork = self.http_client.fork();
let mut userpass_credential_opt = None;
if let Some(credential_id) = credential_id {
let credential = credential_3rd::Model::find_by_id(ctx.clone(), credential_id).await?;
if let Some(credential) = credential {
// Guard against attaching e.g. a credential for another site.
if credential.credential_type != Credential3rdType::Mikan {
return Err(RecorderError::Credential3rdError {
message: "credential is not a mikan credential".to_string(),
source: None.into(),
});
}
// Decrypts/extracts the stored username/password (+ optional
// cookies and user agent) from the DB row.
let userpass_credential: UserPassCredential =
credential.try_into_userpass_credential(ctx)?;
if let Some(cookies) = userpass_credential.cookies.as_ref() {
fork = fork.attach_cookies(cookies)?;
}
if let Some(user_agent) = userpass_credential.user_agent.as_ref() {
fork = fork.attach_user_agent(user_agent);
}
userpass_credential_opt = Some(userpass_credential);
} else {
return Err(RecorderError::from_db_record_not_found(
DbErr::RecordNotFound(format!("credential={} not found", credential_id)),
));
}
}
// The fork keeps the same base/origin URLs; only the HTTP client state
// and the attached credential differ.
Ok(Self {
http_client: HttpClient::from_fork(fork)?,
base_url: self.base_url.clone(),
origin_url: self.origin_url.clone(),
userpass_credential: userpass_credential_opt,
})
}

View File

@@ -1,3 +1,6 @@
// Storage bucket key under which mikan assets (posters etc.) are kept.
pub const MIKAN_BUCKET_KEY: &str = "mikan";
// Display name mikan uses for raw / unknown-subtitle releases.
pub const MIKAN_UNKNOWN_FANSUB_NAME: &str = "生肉/不明字幕";
// Fansub id mikan assigns to the unknown-fansub bucket above.
pub const MIKAN_UNKNOWN_FANSUB_ID: &str = "202";
// Path of the login page; also matched against redirect `location` headers.
pub const MIKAN_LOGIN_PAGE_PATH: &str = "/Account/Login";
// Search string appended to the login page URL (ReturnUrl=/, URL-encoded).
pub const MIKAN_LOGIN_PAGE_SEARCH: &str = "?ReturnUrl=%2F";
// Account-manage page used by `has_login` to probe session validity.
pub const MIKAN_ACCOUNT_MANAGE_PAGE_PATH: &str = "/Account/Manage";

View File

@@ -4,18 +4,20 @@ pub mod constants;
pub mod rss_extract;
pub mod web_extract;
pub use client::{MikanAuthSecrecy, MikanClient};
pub use client::{MikanClient, MikanCredentialForm};
pub use config::MikanConfig;
pub use constants::MIKAN_BUCKET_KEY;
pub use rss_extract::{
MikanBangumiAggregationRssChannel, MikanBangumiRssChannel, MikanBangumiRssLink,
MikanBangumiAggregationRssChannel, MikanBangumiRssChannel, MikanBangumiRssUrlMeta,
MikanRssChannel, MikanRssItem, MikanSubscriberAggregationRssChannel,
MikanSubscriberAggregationRssLink, build_mikan_bangumi_rss_link,
build_mikan_subscriber_aggregation_rss_link, extract_mikan_bangumi_id_from_rss_link,
MikanSubscriberAggregationRssUrlMeta, build_mikan_bangumi_rss_url,
build_mikan_subscriber_aggregation_rss_url, extract_mikan_bangumi_id_from_rss_url,
extract_mikan_rss_channel_from_rss_link, extract_mikan_subscriber_aggregation_id_from_rss_link,
};
pub use web_extract::{
MikanBangumiMeta, MikanEpisodeMeta, build_mikan_bangumi_homepage, build_mikan_episode_homepage,
MikanBangumiMeta, MikanEpisodeMeta, MikanSeasonStr, build_mikan_bangumi_homepage_url,
build_mikan_episode_homepage_url, build_mikan_season_flow_url,
extract_mikan_bangumi_indices_meta_from_season_flow_fragment,
extract_mikan_bangumi_meta_from_bangumi_homepage,
extract_mikan_episode_meta_from_episode_homepage,
};

View File

@@ -12,7 +12,7 @@ use crate::{
errors::app_error::{RecorderError, RecorderResult},
extract::mikan::{
MikanClient,
web_extract::{MikanEpisodeHomepage, extract_mikan_episode_id_from_homepage},
web_extract::{MikanEpisodeHomepage, extract_mikan_episode_id_from_homepage_url},
},
};
@@ -135,7 +135,7 @@ impl TryFrom<rss::Item> for MikanRssItem {
let MikanEpisodeHomepage {
mikan_episode_id, ..
} = extract_mikan_episode_id_from_homepage(&homepage).ok_or_else(|| {
} = extract_mikan_episode_id_from_homepage_url(&homepage).ok_or_else(|| {
RecorderError::from_mikan_rss_invalid_field(Cow::Borrowed("mikan_episode_id"))
})?;
@@ -155,17 +155,17 @@ impl TryFrom<rss::Item> for MikanRssItem {
}
#[derive(Debug, Clone)]
pub struct MikanBangumiRssLink {
pub struct MikanBangumiRssUrlMeta {
pub mikan_bangumi_id: String,
pub mikan_fansub_id: Option<String>,
}
#[derive(Debug, Clone)]
pub struct MikanSubscriberAggregationRssLink {
pub struct MikanSubscriberAggregationRssUrlMeta {
pub mikan_aggregation_id: String,
}
pub fn build_mikan_bangumi_rss_link(
pub fn build_mikan_bangumi_rss_url(
mikan_base_url: impl IntoUrl,
mikan_bangumi_id: &str,
mikan_fansub_id: Option<&str>,
@@ -181,7 +181,7 @@ pub fn build_mikan_bangumi_rss_link(
Ok(url)
}
pub fn build_mikan_subscriber_aggregation_rss_link(
pub fn build_mikan_subscriber_aggregation_rss_url(
mikan_base_url: &str,
mikan_aggregation_id: &str,
) -> RecorderResult<Url> {
@@ -192,11 +192,11 @@ pub fn build_mikan_subscriber_aggregation_rss_link(
Ok(url)
}
pub fn extract_mikan_bangumi_id_from_rss_link(url: &Url) -> Option<MikanBangumiRssLink> {
pub fn extract_mikan_bangumi_id_from_rss_url(url: &Url) -> Option<MikanBangumiRssUrlMeta> {
if url.path() == "/RSS/Bangumi" {
url.query_pairs()
.find(|(k, _)| k == "bangumiId")
.map(|(_, v)| MikanBangumiRssLink {
.map(|(_, v)| MikanBangumiRssUrlMeta {
mikan_bangumi_id: v.to_string(),
mikan_fansub_id: url
.query_pairs()
@@ -210,10 +210,10 @@ pub fn extract_mikan_bangumi_id_from_rss_link(url: &Url) -> Option<MikanBangumiR
pub fn extract_mikan_subscriber_aggregation_id_from_rss_link(
url: &Url,
) -> Option<MikanSubscriberAggregationRssLink> {
) -> Option<MikanSubscriberAggregationRssUrlMeta> {
if url.path() == "/RSS/MyBangumi" {
url.query_pairs().find(|(k, _)| k == "token").map(|(_, v)| {
MikanSubscriberAggregationRssLink {
MikanSubscriberAggregationRssUrlMeta {
mikan_aggregation_id: v.to_string(),
}
})
@@ -233,10 +233,10 @@ pub async fn extract_mikan_rss_channel_from_rss_link(
let channel_link = Url::parse(channel.link())?;
if let Some(MikanBangumiRssLink {
if let Some(MikanBangumiRssUrlMeta {
mikan_bangumi_id,
mikan_fansub_id,
}) = extract_mikan_bangumi_id_from_rss_link(&channel_link)
}) = extract_mikan_bangumi_id_from_rss_url(&channel_link)
{
tracing::trace!(
mikan_bangumi_id,
@@ -290,7 +290,7 @@ pub async fn extract_mikan_rss_channel_from_rss_link(
},
))
}
} else if let Some(MikanSubscriberAggregationRssLink {
} else if let Some(MikanSubscriberAggregationRssUrlMeta {
mikan_aggregation_id,
..
}) = extract_mikan_subscriber_aggregation_id_from_rss_link(&channel_link)

View File

@@ -1,22 +1,19 @@
use std::{borrow::Cow, sync::Arc};
use std::{borrow::Cow, fmt};
use async_stream::try_stream;
use bytes::Bytes;
use fetch::{html::fetch_html, image::fetch_image};
use futures::Stream;
use itertools::Itertools;
use html_escape::decode_html_entities;
use scraper::{Html, Selector};
use serde::{Deserialize, Serialize};
use tracing::instrument;
use url::Url;
use super::{
MIKAN_BUCKET_KEY, MikanAuthSecrecy, MikanBangumiRssLink, MikanClient,
extract_mikan_bangumi_id_from_rss_link,
MIKAN_BUCKET_KEY, MikanBangumiRssUrlMeta, MikanClient, extract_mikan_bangumi_id_from_rss_url,
};
use crate::{
app::AppContextTrait,
errors::app_error::{RecorderResult, RecorderError},
errors::app_error::{RecorderError, RecorderResult},
extract::{
html::{extract_background_image_src_from_style_attr, extract_inner_text_from_element_ref},
media::extract_image_src_from_str,
@@ -24,6 +21,29 @@ use crate::{
storage::StorageContentCategory,
};
#[derive(Clone, Debug, Copy, Serialize, Deserialize)]
/// Season value as mikan's `/Home/BangumiCoverFlow` endpoint expects it in
/// the `seasonStr` query parameter (the Chinese season names).
///
/// NOTE(review): the rename strings had been lost (all were empty `""`),
/// which made every variant serialize and display identically; restored to
/// the Chinese season names — confirm against mikan's season flow requests.
pub enum MikanSeasonStr {
    #[serde(rename = "春")]
    Spring,
    #[serde(rename = "夏")]
    Summer,
    #[serde(rename = "秋")]
    Autumn,
    #[serde(rename = "冬")]
    Winter,
}

impl fmt::Display for MikanSeasonStr {
    /// Render the season exactly as the `seasonStr` query parameter expects,
    /// mirroring the `serde(rename)` values above.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let season = match self {
            Self::Spring => "春",
            Self::Summer => "夏",
            Self::Autumn => "秋",
            Self::Winter => "冬",
        };
        write!(f, "{season}")
    }
}
#[derive(Clone, Debug, PartialEq)]
pub struct MikanEpisodeMeta {
pub homepage: Url,
@@ -36,6 +56,14 @@ pub struct MikanEpisodeMeta {
pub mikan_episode_id: String,
}
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
/// Bangumi entry as listed on an index page (e.g. the season cover flow),
/// before any fansub-specific detail is known. Convertible into
/// `MikanBangumiMeta` with the fansub fields left empty.
pub struct MikanBangumiIndexMeta {
/// Bangumi homepage URL on mikan.
pub homepage: Url,
/// Poster image URL, when one could be extracted from the listing.
pub origin_poster_src: Option<Url>,
/// Display title of the bangumi.
pub bangumi_title: String,
/// Mikan's numeric bangumi id, kept as a string.
pub mikan_bangumi_id: String,
}
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct MikanBangumiMeta {
pub homepage: Url,
@@ -53,6 +81,19 @@ pub struct MikanBangumiPosterMeta {
pub poster_src: Option<String>,
}
impl From<MikanBangumiIndexMeta> for MikanBangumiMeta {
fn from(index_meta: MikanBangumiIndexMeta) -> Self {
MikanBangumiMeta {
homepage: index_meta.homepage,
origin_poster_src: index_meta.origin_poster_src,
bangumi_title: index_meta.bangumi_title,
mikan_bangumi_id: index_meta.mikan_bangumi_id,
mikan_fansub_id: None,
fansub: None,
}
}
}
#[derive(Clone, Debug, PartialEq)]
pub struct MikanEpisodeHomepage {
pub mikan_episode_id: String,
@@ -64,7 +105,7 @@ pub struct MikanBangumiHomepage {
pub mikan_fansub_id: Option<String>,
}
pub fn build_mikan_bangumi_homepage(
pub fn build_mikan_bangumi_homepage_url(
mikan_base_url: Url,
mikan_bangumi_id: &str,
mikan_fansub_id: Option<&str>,
@@ -75,13 +116,29 @@ pub fn build_mikan_bangumi_homepage(
url
}
pub fn build_mikan_episode_homepage(mikan_base_url: Url, mikan_episode_id: &str) -> Url {
/// Build the season "cover flow" listing URL, e.g.
/// `/Home/BangumiCoverFlow?year=2025&seasonStr=…`.
pub fn build_mikan_season_flow_url(
    mikan_base_url: Url,
    year: i32,
    season_str: MikanSeasonStr,
) -> Url {
    let mut url = mikan_base_url;
    url.set_path("/Home/BangumiCoverFlow");
    let year_value = year.to_string();
    let season_value = season_str.to_string();
    url.query_pairs_mut()
        .append_pair("year", &year_value)
        .append_pair("seasonStr", &season_value);
    url
}
/// Build an episode homepage URL: `/Home/Episode/{mikan_episode_id}`.
pub fn build_mikan_episode_homepage_url(mikan_base_url: Url, mikan_episode_id: &str) -> Url {
    let mut url = mikan_base_url;
    let episode_path = format!("/Home/Episode/{}", mikan_episode_id);
    url.set_path(&episode_path);
    url
}
pub fn build_mikan_bangumi_expand_info_url(mikan_base_url: Url, mikan_bangumi_id: &str) -> Url {
pub fn build_mikan_bangumi_expand_subscribed_fragment_url(
mikan_base_url: Url,
mikan_bangumi_id: &str,
) -> Url {
let mut url = mikan_base_url;
url.set_path("/ExpandBangumi");
url.query_pairs_mut()
@@ -90,7 +147,7 @@ pub fn build_mikan_bangumi_expand_info_url(mikan_base_url: Url, mikan_bangumi_id
url
}
pub fn extract_mikan_bangumi_id_from_homepage(url: &Url) -> Option<MikanBangumiHomepage> {
pub fn extract_mikan_bangumi_id_from_homepage_url(url: &Url) -> Option<MikanBangumiHomepage> {
if url.path().starts_with("/Home/Bangumi/") {
let mikan_bangumi_id = url.path().replace("/Home/Bangumi/", "");
@@ -103,7 +160,7 @@ pub fn extract_mikan_bangumi_id_from_homepage(url: &Url) -> Option<MikanBangumiH
}
}
pub fn extract_mikan_episode_id_from_homepage(url: &Url) -> Option<MikanEpisodeHomepage> {
pub fn extract_mikan_episode_id_from_homepage_url(url: &Url) -> Option<MikanEpisodeHomepage> {
if url.path().starts_with("/Home/Episode/") {
let mikan_episode_id = url.path().replace("/Home/Episode/", "");
Some(MikanEpisodeHomepage { mikan_episode_id })
@@ -191,7 +248,7 @@ pub async fn extract_mikan_episode_meta_from_episode_homepage(
tracing::warn!(error = %error);
})?;
let MikanBangumiRssLink {
let MikanBangumiRssUrlMeta {
mikan_bangumi_id,
mikan_fansub_id,
..
@@ -200,7 +257,7 @@ pub async fn extract_mikan_episode_meta_from_episode_homepage(
.next()
.and_then(|el| el.value().attr("href"))
.and_then(|s| mikan_episode_homepage_url.join(s).ok())
.and_then(|rss_link_url| extract_mikan_bangumi_id_from_rss_link(&rss_link_url))
.and_then(|rss_link_url| extract_mikan_bangumi_id_from_rss_url(&rss_link_url))
.ok_or_else(|| {
RecorderError::from_mikan_meta_missing_field(Cow::Borrowed("mikan_bangumi_id"))
})
@@ -223,7 +280,7 @@ pub async fn extract_mikan_episode_meta_from_episode_homepage(
let MikanEpisodeHomepage {
mikan_episode_id, ..
} = extract_mikan_episode_id_from_homepage(&mikan_episode_homepage_url)
} = extract_mikan_episode_id_from_homepage_url(&mikan_episode_homepage_url)
.ok_or_else(|| {
RecorderError::from_mikan_meta_missing_field(Cow::Borrowed("mikan_episode_id"))
})
@@ -303,9 +360,9 @@ pub async fn extract_mikan_bangumi_meta_from_bangumi_homepage(
.next()
.and_then(|el| el.value().attr("href"))
.and_then(|s| mikan_bangumi_homepage_url.join(s).ok())
.and_then(|rss_link_url| extract_mikan_bangumi_id_from_rss_link(&rss_link_url))
.and_then(|rss_link_url| extract_mikan_bangumi_id_from_rss_url(&rss_link_url))
.map(
|MikanBangumiRssLink {
|MikanBangumiRssUrlMeta {
mikan_bangumi_id, ..
}| mikan_bangumi_id,
)
@@ -325,7 +382,7 @@ pub async fn extract_mikan_bangumi_meta_from_bangumi_homepage(
})
});
let (mikan_fansub_id, fansub_name) = mikan_bangumi_homepage_url
let (mikan_fansub_id, fansub) = mikan_bangumi_homepage_url
.fragment()
.and_then(|id| {
html.select(
@@ -341,7 +398,7 @@ pub async fn extract_mikan_bangumi_meta_from_bangumi_homepage(
bangumi_title,
mikan_bangumi_id,
origin_poster_src = origin_poster_src.as_ref().map(|url| url.as_str()),
fansub_name,
fansub,
mikan_fansub_id,
"mikan bangumi meta extracted"
);
@@ -351,154 +408,141 @@ pub async fn extract_mikan_bangumi_meta_from_bangumi_homepage(
bangumi_title,
origin_poster_src,
mikan_bangumi_id,
fansub: fansub_name,
fansub,
mikan_fansub_id,
})
}
#[instrument(skip_all, fields(my_bangumi_page_url, auth_secrecy = ?auth_secrecy, history = history.len()))]
pub fn extract_mikan_bangumis_meta_from_my_bangumi_page(
context: Arc<dyn AppContextTrait>,
my_bangumi_page_url: Url,
auth_secrecy: Option<MikanAuthSecrecy>,
history: &[Arc<RecorderResult<MikanBangumiMeta>>],
) -> impl Stream<Item = RecorderResult<MikanBangumiMeta>> {
try_stream! {
let http_client = &context.mikan().fork_with_auth(auth_secrecy.clone())?;
#[instrument]
pub fn extract_mikan_bangumi_indices_meta_from_season_flow_fragment(
season_flow_fragment: &str,
mikan_base_url: Url,
) -> Vec<MikanBangumiIndexMeta> {
let html = Html::parse_fragment(season_flow_fragment);
let mikan_base_url = Url::parse(&my_bangumi_page_url.origin().unicode_serialization())?;
let bangumi_empty_selector = &Selector::parse(".no-subscribe-bangumi").unwrap();
let content = fetch_html(http_client, my_bangumi_page_url.clone()).await?;
if html.select(bangumi_empty_selector).next().is_some() {
return vec![];
}
let fansub_container_selector =
&Selector::parse(".js-expand_bangumi-subgroup.js-subscribed").unwrap();
let fansub_title_selector = &Selector::parse(".tag-res-name[title]").unwrap();
let fansub_id_selector =
&Selector::parse(".active[data-subtitlegroupid][data-bangumiid]").unwrap();
let bangumi_item_selector = &Selector::parse(".mine.an-box ul.an-ul>li").unwrap();
let bangumi_poster_span_selector = &Selector::parse("span[data-src][data-bangumiid]").unwrap();
let bangumi_title_a_selector = &Selector::parse(".an-info-group a.an-text[title]").unwrap();
let bangumi_items = {
let html = Html::parse_document(&content);
let mut items = vec![];
for bangumi_item in html.select(bangumi_item_selector) {
let bangumi_poster_span = bangumi_item.select(bangumi_poster_span_selector).next();
let bangumi_title_a = bangumi_item.select(bangumi_title_a_selector).next();
if let (Some(bangumi_poster_span), Some(bangumi_title_a)) =
(bangumi_poster_span, bangumi_title_a)
{
let origin_poster_src = bangumi_poster_span
.attr("data-src")
.and_then(|data_src| extract_image_src_from_str(data_src, &mikan_base_url));
let bangumi_title = bangumi_title_a
.attr("title")
.map(|title| decode_html_entities(&title).trim().to_string());
let mikan_bangumi_id = bangumi_poster_span
.attr("data-bangumiid")
.map(|id| id.to_string());
let bangumi_container_selector = &Selector::parse(".sk-bangumi .an-ul>li").unwrap();
let bangumi_info_selector = &Selector::parse(".an-info a.an-text").unwrap();
let bangumi_poster_selector =
&Selector::parse("span[data-src][data-bangumiid], span[data-bangumiid][style]")
.unwrap();
html.select(bangumi_container_selector)
.filter_map(|bangumi_elem| {
let title_and_href_elem =
bangumi_elem.select(bangumi_info_selector).next();
let poster_elem = bangumi_elem.select(bangumi_poster_selector).next();
if let (Some(bangumi_home_page_url), Some(bangumi_title)) = (
title_and_href_elem.and_then(|elem| elem.attr("href")),
title_and_href_elem.and_then(|elem| elem.attr("title")),
) {
let origin_poster_src = poster_elem.and_then(|ele| {
ele.attr("data-src")
.and_then(|data_src| {
extract_image_src_from_str(data_src, &mikan_base_url)
})
.or_else(|| {
ele.attr("style").and_then(|style| {
extract_background_image_src_from_style_attr(
style,
&mikan_base_url,
)
})
})
});
let bangumi_title = bangumi_title.to_string();
let bangumi_home_page_url =
my_bangumi_page_url.join(bangumi_home_page_url).ok()?;
let MikanBangumiHomepage {
mikan_bangumi_id, ..
} = extract_mikan_bangumi_id_from_homepage(&bangumi_home_page_url)?;
if let Some(origin_poster_src) = origin_poster_src.as_ref() {
tracing::trace!(
origin_poster_src = origin_poster_src.as_str(),
bangumi_title,
mikan_bangumi_id,
"bangumi info extracted"
);
} else {
tracing::warn!(
bangumi_title,
mikan_bangumi_id,
"bangumi info extracted, but failed to extract poster_src"
);
}
let bangumi_expand_info_url = build_mikan_bangumi_expand_info_url(
mikan_base_url.clone(),
&mikan_bangumi_id,
);
Some((
if let (Some(bangumi_title), Some(mikan_bangumi_id)) = (bangumi_title, mikan_bangumi_id)
{
let homepage = build_mikan_bangumi_homepage_url(
mikan_base_url.clone(),
&mikan_bangumi_id,
None,
);
if let Some(origin_poster_src) = origin_poster_src.as_ref() {
tracing::trace!(
origin_poster_src = origin_poster_src.as_str(),
bangumi_title,
mikan_bangumi_id,
bangumi_expand_info_url,
origin_poster_src,
))
"bangumi index meta extracted"
);
} else {
tracing::warn!(
bangumi_title,
mikan_bangumi_id,
"bangumi index meta extracted, but failed to extract poster_src"
);
}
items.push(MikanBangumiIndexMeta {
homepage,
origin_poster_src,
bangumi_title,
mikan_bangumi_id,
})
}
}
}
items
}
#[instrument(skip_all, fields(mikan_bangumi_index = mikan_bangumi_index.mikan_bangumi_id.as_str()))]
pub fn extract_mikan_bangumi_meta_from_expand_subscribed_fragment(
mikan_bangumi_index: MikanBangumiIndexMeta,
expand_subscribed_fragment: &str,
mikan_base_url: Url,
) -> Option<MikanBangumiMeta> {
let html = Html::parse_fragment(expand_subscribed_fragment);
let fansub_container_selector =
&Selector::parse(".js-expand_bangumi-subgroup.js-subscribed").unwrap();
let fansub_title_selector = &Selector::parse(".tag-res-name[title]").unwrap();
let fansub_id_selector =
&Selector::parse(".active[data-subtitlegroupid][data-bangumiid]").unwrap();
if let Some((fansub_name, mikan_fansub_id)) = {
html.select(fansub_container_selector)
.next()
.and_then(|fansub_info| {
if let (Some(fansub_name), Some(mikan_fansub_id)) = (
fansub_info
.select(fansub_title_selector)
.next()
.and_then(|ele| ele.attr("title"))
.map(String::from),
fansub_info
.select(fansub_id_selector)
.next()
.and_then(|ele| ele.attr("data-subtitlegroupid"))
.map(String::from),
) {
Some((fansub_name, mikan_fansub_id))
} else {
None
}
})
.collect_vec()
};
} {
tracing::trace!(fansub_name, mikan_fansub_id, "subscribed fansub extracted");
let mikan_bangumi_id = mikan_bangumi_index.mikan_bangumi_id;
let bangumi_title = mikan_bangumi_index.bangumi_title;
let origin_poster_src = mikan_bangumi_index.origin_poster_src;
for (idx, (bangumi_title, mikan_bangumi_id, bangumi_expand_info_url, origin_poster_src)) in
bangumi_items.iter().enumerate()
{
if history.get(idx).is_some() {
continue;
} else if let Some((fansub_name, mikan_fansub_id)) = {
let bangumi_expand_info_content =
fetch_html(http_client, bangumi_expand_info_url.clone()).await?;
let bangumi_expand_info_fragment =
Html::parse_fragment(&bangumi_expand_info_content);
bangumi_expand_info_fragment
.select(fansub_container_selector)
.next()
.and_then(|fansub_info| {
if let (Some(fansub_name), Some(mikan_fansub_id)) = (
fansub_info
.select(fansub_title_selector)
.next()
.and_then(|ele| ele.attr("title"))
.map(String::from),
fansub_info
.select(fansub_id_selector)
.next()
.and_then(|ele| ele.attr("data-subtitlegroupid"))
.map(String::from),
) {
Some((fansub_name, mikan_fansub_id))
} else {
None
}
})
} {
tracing::trace!(fansub_name, mikan_fansub_id, "subscribed fansub extracted");
let item = MikanBangumiMeta {
homepage: build_mikan_bangumi_homepage(
mikan_base_url.clone(),
mikan_bangumi_id,
Some(&mikan_fansub_id),
),
bangumi_title: bangumi_title.to_string(),
mikan_bangumi_id: mikan_bangumi_id.to_string(),
mikan_fansub_id: Some(mikan_fansub_id),
fansub: Some(fansub_name),
origin_poster_src: origin_poster_src.clone(),
};
yield item;
}
}
Some(MikanBangumiMeta {
homepage: build_mikan_bangumi_homepage_url(
mikan_base_url.clone(),
&mikan_bangumi_id,
Some(&mikan_fansub_id),
),
bangumi_title: bangumi_title.to_string(),
mikan_bangumi_id: mikan_bangumi_id.to_string(),
mikan_fansub_id: Some(mikan_fansub_id),
fansub: Some(fansub_name),
origin_poster_src: origin_poster_src.clone(),
})
} else {
tracing::trace!("subscribed fansub not found");
None
}
}
#[cfg(test)]
mod test {
#![allow(unused_variables)]
use std::{fs, sync::Arc};
use futures::{TryStreamExt, pin_mut};
use http::header;
use rstest::{fixture, rstest};
@@ -507,9 +551,12 @@ mod test {
use zune_image::{codecs::ImageFormat, image::Image};
use super::*;
use crate::test_utils::{
app::UnitTestAppContext, mikan::build_testing_mikan_client,
tracing::try_init_testing_tracing,
use crate::{
extract::mikan::MikanCredentialForm,
test_utils::{
app::UnitTestAppContext, mikan::build_testing_mikan_client,
tracing::try_init_testing_tracing,
},
};
#[fixture]
@@ -590,7 +637,9 @@ mod test {
#[rstest]
#[tokio::test]
async fn test_extract_mikan_bangumi_meta_from_bangumi_homepage(before_each: ()) -> RecorderResult<()> {
async fn test_extract_mikan_bangumi_meta_from_bangumi_homepage(
before_each: (),
) -> RecorderResult<()> {
let mut mikan_server = mockito::Server::new_async().await;
let mikan_base_url = Url::parse(&mikan_server.url())?;
let mikan_client = build_testing_mikan_client(mikan_base_url.clone()).await?;
@@ -626,95 +675,217 @@ mod test {
}
#[rstest]
#[tokio::test]
async fn test_extract_mikan_bangumis_meta_from_my_bangumi_page(before_each: ()) -> RecorderResult<()> {
let mut mikan_server = mockito::Server::new_async().await;
#[test]
fn test_extract_mikan_bangumi_indices_meta_from_season_flow_fragment(
before_each: (),
) -> RecorderResult<()> {
let fragment =
fs::read_to_string("tests/resources/mikan/BangumiCoverFlow-2025-spring.html")?;
let mikan_base_url = Url::parse(&mikan_server.url())?;
let my_bangumi_page_url = mikan_base_url.join("/Home/MyBangumi")?;
let context = Arc::new(
UnitTestAppContext::builder()
.mikan(build_testing_mikan_client(mikan_base_url.clone()).await?)
.build(),
let indices = extract_mikan_bangumi_indices_meta_from_season_flow_fragment(
&fragment,
Url::parse("https://mikanani.me/")?,
);
{
let my_bangumi_without_cookie_mock = mikan_server
.mock("GET", my_bangumi_page_url.path())
.match_header(header::COOKIE, mockito::Matcher::Missing)
.with_body_from_file("tests/resources/mikan/MyBangumi-noauth.htm")
.create_async()
.await;
tracing::info!("indices: {:#?}", &indices[0]);
let bangumi_metas = extract_mikan_bangumis_meta_from_my_bangumi_page(
context.clone(),
my_bangumi_page_url.clone(),
None,
&[],
);
pin_mut!(bangumi_metas);
let bangumi_metas = bangumi_metas.try_collect::<Vec<_>>().await?;
assert!(bangumi_metas.is_empty());
assert!(my_bangumi_without_cookie_mock.matched_async().await);
}
{
let my_bangumi_with_cookie_mock = mikan_server
.mock("GET", my_bangumi_page_url.path())
.match_header(
header::COOKIE,
mockito::Matcher::AllOf(vec![
mockito::Matcher::Regex(String::from(".*\\.AspNetCore\\.Antiforgery.*")),
mockito::Matcher::Regex(String::from(
".*\\.AspNetCore\\.Identity\\.Application.*",
)),
]),
)
.with_body_from_file("tests/resources/mikan/MyBangumi.htm")
.create_async()
.await;
let expand_bangumi_mock = mikan_server
.mock("GET", "/ExpandBangumi")
.match_query(mockito::Matcher::Any)
.with_body_from_file("tests/resources/mikan/ExpandBangumi.htm")
.create_async()
.await;
let auth_secrecy = Some(MikanAuthSecrecy {
cookie: String::from(
"mikan-announcement=1; .AspNetCore.Antiforgery.abc=abc; \
.AspNetCore.Identity.Application=abc; ",
),
user_agent: Some(String::from(
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like \
Gecko) Chrome/133.0.0.0 Safari/537.36 Edg/133.0.0.0",
)),
});
let bangumi_metas = extract_mikan_bangumis_meta_from_my_bangumi_page(
context.clone(),
my_bangumi_page_url,
auth_secrecy,
&[],
);
pin_mut!(bangumi_metas);
let bangumi_metas = bangumi_metas.try_collect::<Vec<_>>().await?;
assert!(!bangumi_metas.is_empty());
assert!(bangumi_metas[0].origin_poster_src.is_some());
assert!(my_bangumi_with_cookie_mock.matched_async().await);
expand_bangumi_mock.expect(bangumi_metas.len());
}
assert_eq!(indices.len(), 49);
let first = &indices[0];
assert_eq!(first.bangumi_title, "吉伊卡哇");
assert_eq!(first.mikan_bangumi_id, "3288");
assert_eq!(
first.homepage.to_string(),
String::from("https://mikanani.me/Home/Bangumi/3288")
);
assert_eq!(
first
.origin_poster_src
.as_ref()
.map(|s| s.to_string())
.unwrap_or_default(),
String::from("https://mikanani.me/images/Bangumi/202204/d8ef46c0.jpg")
);
Ok(())
}
#[rstest]
#[test]
// An unauthenticated season-flow fragment must yield no bangumi indices
// (the fixture contains the logged-out page variant).
fn test_extract_mikan_bangumi_indices_meta_from_season_flow_fragment_noauth(
before_each: (),
) -> RecorderResult<()> {
let fragment =
fs::read_to_string("tests/resources/mikan/BangumiCoverFlow-2025-spring-noauth.html")?;
let indices = extract_mikan_bangumi_indices_meta_from_season_flow_fragment(
&fragment,
Url::parse("https://mikanani.me/")?,
);
assert!(indices.is_empty());
Ok(())
}
#[rstest]
#[test]
// The expand-subscribed fragment for bangumi 3599 should enrich the index
// meta with the subscribed fansub (LoliHouse, id 370) and set the homepage
// fragment to that fansub id.
fn test_extract_mikan_bangumi_meta_from_expand_subscribed_fragment(
before_each: (),
) -> RecorderResult<()> {
let origin_poster_src =
Url::parse("https://mikanani.me/images/Bangumi/202504/076c1094.jpg")?;
// Index meta as it would come from the season cover flow page.
let bangumi_index = MikanBangumiIndexMeta {
homepage: Url::parse("https://mikanani.me/Home/Bangumi/3599")?,
origin_poster_src: Some(origin_poster_src.clone()),
bangumi_title: "夏日口袋".to_string(),
mikan_bangumi_id: "3599".to_string(),
};
let fragment = fs::read_to_string("tests/resources/mikan/ExpandBangumi-3599.html")?;
let bangumi = extract_mikan_bangumi_meta_from_expand_subscribed_fragment(
bangumi_index.clone(),
&fragment,
Url::parse("https://mikanani.me/")?,
)
.unwrap_or_else(|| {
panic!("bangumi should not be None");
});
// Homepage gains the fansub id as a URL fragment.
assert_eq!(
bangumi.homepage,
Url::parse("https://mikanani.me/Home/Bangumi/3599#370")?
);
// Index-level fields must pass through unchanged.
assert_eq!(bangumi.bangumi_title, bangumi_index.bangumi_title);
assert_eq!(bangumi.mikan_bangumi_id, bangumi_index.mikan_bangumi_id);
assert_eq!(bangumi.origin_poster_src, bangumi_index.origin_poster_src);
assert_eq!(bangumi.mikan_fansub_id, Some(String::from("370")));
assert_eq!(bangumi.fansub, Some(String::from("LoliHouse")));
Ok(())
}
#[rstest]
#[test]
// Without an authenticated session the expand fragment carries no
// subscribed fansub, so extraction must return None.
fn test_extract_mikan_bangumi_meta_from_expand_subscribed_fragment_noauth(
before_each: (),
) -> RecorderResult<()> {
let origin_poster_src =
Url::parse("https://mikanani.me/images/Bangumi/202504/076c1094.jpg")?;
let bangumi_index = MikanBangumiIndexMeta {
homepage: Url::parse("https://mikanani.me/Home/Bangumi/3599")?,
origin_poster_src: Some(origin_poster_src.clone()),
bangumi_title: "夏日口袋".to_string(),
mikan_bangumi_id: "3599".to_string(),
};
let fragment = fs::read_to_string("tests/resources/mikan/ExpandBangumi-3599-noauth.html")?;
let bangumi = extract_mikan_bangumi_meta_from_expand_subscribed_fragment(
bangumi_index.clone(),
&fragment,
Url::parse("https://mikanani.me/")?,
);
assert!(bangumi.is_none());
Ok(())
}
// #[rstest]
// #[tokio::test]
// async fn test_extract_mikan_bangumis_meta_from_my_bangumi_page(
// before_each: (),
// ) -> RecorderResult<()> {
// let mut mikan_server = mockito::Server::new_async().await;
// let mikan_base_url = Url::parse(&mikan_server.url())?;
// let my_bangumi_page_url = mikan_base_url.join("/Home/MyBangumi")?;
// let context = Arc::new(
// UnitTestAppContext::builder()
//
// .mikan(build_testing_mikan_client(mikan_base_url.clone()).await?)
// .build(),
// );
// {
// let my_bangumi_without_cookie_mock = mikan_server
// .mock("GET", my_bangumi_page_url.path())
// .match_header(header::COOKIE, mockito::Matcher::Missing)
//
// .with_body_from_file("tests/resources/mikan/MyBangumi-noauth.htm")
// .create_async()
// .await;
// let bangumi_metas =
// extract_mikan_bangumis_meta_from_my_bangumi_page(
// context.clone(), my_bangumi_page_url.clone(),
// None,
// &[],
// );
// pin_mut!(bangumi_metas);
// let bangumi_metas = bangumi_metas.try_collect::<Vec<_>>().await?;
// assert!(bangumi_metas.is_empty());
// assert!(my_bangumi_without_cookie_mock.matched_async().await);
// }
// {
// let my_bangumi_with_cookie_mock = mikan_server
// .mock("GET", my_bangumi_page_url.path())
// .match_header(
// header::COOKIE,
// mockito::Matcher::AllOf(vec![
//
// mockito::Matcher::Regex(String::from(".*\\.AspNetCore\\.Antiforgery.*")),
// mockito::Matcher::Regex(String::from(
// ".*\\.AspNetCore\\.Identity\\.Application.*",
// )),
// ]),
// )
// .with_body_from_file("tests/resources/mikan/MyBangumi.htm")
// .create_async()
// .await;
// let expand_bangumi_mock = mikan_server
// .mock("GET", "/ExpandBangumi")
// .match_query(mockito::Matcher::Any)
//
// .with_body_from_file("tests/resources/mikan/ExpandBangumi.htm")
// .create_async()
// .await;
// let auth_secrecy = Some(MikanCredentialForm {
// username: String::from("test_username"),
// password: String::from("test_password"),
// user_agent: String::from(
// "Mozilla/5.0 (Windows NT 10.0; Win64; x64)
// AppleWebKit/537.36 (KHTML, like \ Gecko)
// Chrome/133.0.0.0 Safari/537.36 Edg/133.0.0.0", ),
// });
// let bangumi_metas =
// extract_mikan_bangumis_meta_from_my_bangumi_page(
// context.clone(), my_bangumi_page_url,
// auth_secrecy,
// &[],
// );
// pin_mut!(bangumi_metas);
// let bangumi_metas = bangumi_metas.try_collect::<Vec<_>>().await?;
// assert!(!bangumi_metas.is_empty());
// assert!(bangumi_metas[0].origin_poster_src.is_some());
// assert!(my_bangumi_with_cookie_mock.matched_async().await);
// expand_bangumi_mock.expect(bangumi_metas.len());
// }
// Ok(())
// }
}