fix: fix scrape mikan season bangumi list

This commit is contained in:
2025-05-06 02:23:17 +08:00
parent 439353d318
commit a7f52fe0eb
9 changed files with 912 additions and 165 deletions

View File

@@ -2,7 +2,9 @@ use std::{fmt::Debug, ops::Deref, sync::Arc};
use fetch::{HttpClient, HttpClientTrait};
use maplit::hashmap;
use sea_orm::{ActiveModelTrait, ActiveValue::Set, DbErr, TryIntoModel};
use sea_orm::{
ActiveModelTrait, ActiveValue::Set, ColumnTrait, DbErr, EntityTrait, QueryFilter, TryIntoModel,
};
use url::Url;
use util::OptDynErr;
@@ -137,7 +139,7 @@ impl MikanClient {
}
}
pub async fn save_credential(
pub async fn submit_credential_form(
&self,
ctx: Arc<dyn AppContextTrait>,
subscriber_id: i32,
@@ -159,49 +161,67 @@ impl MikanClient {
Ok(credential)
}
/// Writes the HTTP client's current cookie store back onto a stored credential.
///
/// The cookie store is serialized via `save_cookie_store_to_json`; when that
/// yields nothing, the database is left untouched and the call succeeds.
/// Otherwise the serialized cookies are placed in an otherwise-default
/// `credential_3rd::ActiveModel`, passed through `try_encrypt`, and applied
/// with an `update_many` restricted to the row whose id is `credential_id`.
///
/// # Errors
/// Propagates failures from cookie serialization, `try_encrypt`, or the
/// database update.
pub async fn sync_credential_cookies(
    &self,
    ctx: Arc<dyn AppContextTrait>,
    credential_id: i32,
) -> RecorderResult<()> {
    // Nothing to persist when the client has no serializable cookie store.
    let Some(cookie_json) = self.http_client.save_cookie_store_to_json()? else {
        return Ok(());
    };

    // Only `cookies` is set, so the update below leaves other columns alone.
    let plain_update = credential_3rd::ActiveModel {
        cookies: Set(Some(cookie_json)),
        ..Default::default()
    };
    let encrypted_update = plain_update.try_encrypt(ctx.clone()).await?;

    let only_this_credential = credential_3rd::Column::Id.eq(credential_id);
    credential_3rd::Entity::update_many()
        .set(encrypted_update)
        .filter(only_this_credential)
        .exec(ctx.db())
        .await?;

    Ok(())
}
pub async fn fork_with_credential(
&self,
ctx: Arc<dyn AppContextTrait>,
credential_id: Option<i32>,
credential_id: i32,
) -> RecorderResult<Self> {
let mut fork = self.http_client.fork();
let mut userpass_credential_opt = None;
if let Some(credential_id) = credential_id {
let credential = credential_3rd::Model::find_by_id(ctx.clone(), credential_id).await?;
if let Some(credential) = credential {
if credential.credential_type != Credential3rdType::Mikan {
return Err(RecorderError::Credential3rdError {
message: "credential is not a mikan credential".to_string(),
source: None.into(),
});
}
let userpass_credential: UserPassCredential =
credential.try_into_userpass_credential(ctx)?;
if let Some(cookies) = userpass_credential.cookies.as_ref() {
fork = fork.attach_cookies(cookies)?;
}
if let Some(user_agent) = userpass_credential.user_agent.as_ref() {
fork = fork.attach_user_agent(user_agent);
}
userpass_credential_opt = Some(userpass_credential);
} else {
return Err(RecorderError::from_db_record_not_found(
DbErr::RecordNotFound(format!("credential={credential_id} not found")),
));
let credential = credential_3rd::Model::find_by_id(ctx.clone(), credential_id).await?;
if let Some(credential) = credential {
if credential.credential_type != Credential3rdType::Mikan {
return Err(RecorderError::Credential3rdError {
message: "credential is not a mikan credential".to_string(),
source: None.into(),
});
}
}
Ok(Self {
http_client: HttpClient::from_fork(fork)?,
base_url: self.base_url.clone(),
origin_url: self.origin_url.clone(),
userpass_credential: userpass_credential_opt,
})
let userpass_credential: UserPassCredential =
credential.try_into_userpass_credential(ctx)?;
fork = fork.attach_cookies(userpass_credential.cookies.as_deref())?;
if let Some(user_agent) = userpass_credential.user_agent.as_ref() {
fork = fork.attach_user_agent(user_agent);
}
let userpass_credential_opt = Some(userpass_credential);
Ok(Self {
http_client: HttpClient::from_fork(fork)?,
base_url: self.base_url.clone(),
origin_url: self.origin_url.clone(),
userpass_credential: userpass_credential_opt,
})
} else {
Err(RecorderError::from_db_record_not_found(
DbErr::RecordNotFound(format!("credential={credential_id} not found")),
))
}
}
pub fn base_url(&self) -> &Url {

View File

@@ -2,5 +2,7 @@ pub const MIKAN_POSTER_BUCKET_KEY: &str = "mikan_poster";
pub const MIKAN_UNKNOWN_FANSUB_NAME: &str = "生肉/不明字幕";
pub const MIKAN_UNKNOWN_FANSUB_ID: &str = "202";
pub const MIKAN_LOGIN_PAGE_PATH: &str = "/Account/Login";
pub const MIKAN_LOGIN_PAGE_SEARCH: &str = "?ReturnUrl=%2F";
pub const MIKAN_LOGIN_PAGE_SEARCH: &str = "ReturnUrl=%2F";
pub const MIKAN_ACCOUNT_MANAGE_PAGE_PATH: &str = "/Account/Manage";
pub const MIKAN_SEASON_FLOW_PAGE_PATH: &str = "/Home/BangumiCoverFlow";
pub const MIKAN_BANGUMI_EXPAND_SUBSCRIBED_PAGE_PATH: &str = "/Home/ExpandBangumi";

View File

@@ -8,8 +8,9 @@ mod web;
pub use client::MikanClient;
pub use config::MikanConfig;
pub use constants::{
MIKAN_ACCOUNT_MANAGE_PAGE_PATH, MIKAN_LOGIN_PAGE_PATH, MIKAN_LOGIN_PAGE_SEARCH,
MIKAN_POSTER_BUCKET_KEY, MIKAN_UNKNOWN_FANSUB_ID, MIKAN_UNKNOWN_FANSUB_NAME,
MIKAN_ACCOUNT_MANAGE_PAGE_PATH, MIKAN_BANGUMI_EXPAND_SUBSCRIBED_PAGE_PATH,
MIKAN_LOGIN_PAGE_PATH, MIKAN_LOGIN_PAGE_SEARCH, MIKAN_POSTER_BUCKET_KEY,
MIKAN_SEASON_FLOW_PAGE_PATH, MIKAN_UNKNOWN_FANSUB_ID, MIKAN_UNKNOWN_FANSUB_NAME,
};
pub use credential::MikanCredentialForm;
pub use rss::{

View File

@@ -10,7 +10,8 @@ use tracing::instrument;
use url::Url;
use super::{
MIKAN_POSTER_BUCKET_KEY, MikanBangumiRssUrlMeta, MikanClient,
MIKAN_BANGUMI_EXPAND_SUBSCRIBED_PAGE_PATH, MIKAN_POSTER_BUCKET_KEY,
MIKAN_SEASON_FLOW_PAGE_PATH, MikanBangumiRssUrlMeta, MikanClient,
extract_mikan_bangumi_id_from_rss_url,
};
use crate::{
@@ -183,7 +184,7 @@ pub fn build_mikan_season_flow_url(
season_str: MikanSeasonStr,
) -> Url {
let mut url = mikan_base_url;
url.set_path("/Home/BangumiCoverFlow");
url.set_path(MIKAN_SEASON_FLOW_PAGE_PATH);
url.query_pairs_mut()
.append_pair("year", &year.to_string())
.append_pair("seasonStr", &season_str.to_string());
@@ -201,7 +202,7 @@ pub fn build_mikan_bangumi_expand_subscribed_url(
mikan_bangumi_id: &str,
) -> Url {
let mut url = mikan_base_url;
url.set_path("/ExpandBangumi");
url.set_path(MIKAN_BANGUMI_EXPAND_SUBSCRIBED_PAGE_PATH);
url.query_pairs_mut()
.append_pair("bangumiId", mikan_bangumi_id)
.append_pair("showSubscribed", "true");
@@ -651,7 +652,7 @@ pub async fn scrape_mikan_bangumi_meta_list_from_season_flow_url(
credential_id: i32,
) -> RecorderResult<Vec<MikanBangumiMeta>> {
let mikan_client = mikan_client
.fork_with_credential(ctx.clone(), Some(credential_id))
.fork_with_credential(ctx.clone(), credential_id)
.await?;
let mikan_base_url = mikan_client.base_url();
@@ -671,6 +672,10 @@ pub async fn scrape_mikan_bangumi_meta_list_from_season_flow_url(
let mut bangumi_metas = vec![];
mikan_client
.sync_credential_cookies(ctx.clone(), credential_id)
.await?;
for bangumi_index in bangumi_indices_meta {
let bangumi_title = bangumi_index.bangumi_title.clone();
let bangumi_expand_subscribed_fragment_url = build_mikan_bangumi_expand_subscribed_url(
@@ -696,6 +701,10 @@ pub async fn scrape_mikan_bangumi_meta_list_from_season_flow_url(
bangumi_metas.push(bangumi_meta);
}
mikan_client
.sync_credential_cookies(ctx, credential_id)
.await?;
Ok(bangumi_metas)
}
@@ -704,7 +713,6 @@ mod test {
#![allow(unused_variables)]
use std::fs;
use fetch::get_random_ua;
use rstest::{fixture, rstest};
use tracing::Level;
use url::Url;
@@ -712,11 +720,16 @@ mod test {
use super::*;
use crate::{
extract::mikan::MikanCredentialForm,
extract::mikan::{MIKAN_BANGUMI_EXPAND_SUBSCRIBED_PAGE_PATH, MIKAN_SEASON_FLOW_PAGE_PATH},
test_utils::{
app::UnitTestAppContext, crypto::build_testing_crypto_service,
database::build_testing_database_service, mikan::build_testing_mikan_client,
storage::build_testing_storage_service, tracing::try_init_testing_tracing,
app::UnitTestAppContext,
crypto::build_testing_crypto_service,
database::build_testing_database_service,
mikan::{
MikanMockServer, build_testing_mikan_client, build_testing_mikan_credential_form,
},
storage::build_testing_storage_service,
tracing::try_init_testing_tracing,
},
};
@@ -932,8 +945,8 @@ mod test {
async fn test_scrape_mikan_bangumi_meta_list_from_season_flow_url(
before_each: (),
) -> RecorderResult<()> {
let mut mikan_server = mockito::Server::new_async().await;
let mikan_base_url = Url::parse(&mikan_server.url())?;
let mut mikan_server = MikanMockServer::new().await?;
let mikan_base_url = mikan_server.base_url().clone();
let app_ctx = {
let mikan_client = build_testing_mikan_client(mikan_base_url.clone()).await?;
@@ -950,20 +963,50 @@ mod test {
let mikan_client = app_ctx.mikan();
let login_mock = mikan_server.mock_get_login_page();
let season_flow_noauth_mock = mikan_server
.server
.mock("GET", MIKAN_SEASON_FLOW_PAGE_PATH)
.match_query(mockito::Matcher::Any)
.match_request(|req| !MikanMockServer::get_has_auth_matcher()(req))
.with_status(200)
.with_body_from_file("tests/resources/mikan/BangumiCoverFlow-2025-spring-noauth.html")
.create();
let season_flow_mock = mikan_server
.server
.mock("GET", MIKAN_SEASON_FLOW_PAGE_PATH)
.match_query(mockito::Matcher::Any)
.match_request(|req| MikanMockServer::get_has_auth_matcher()(req))
.with_status(200)
.with_body_from_file("tests/resources/mikan/BangumiCoverFlow-2025-spring.html")
.create();
let bangumi_subscribed_noauth_mock = mikan_server
.server
.mock("GET", MIKAN_BANGUMI_EXPAND_SUBSCRIBED_PAGE_PATH)
.match_query(mockito::Matcher::Any)
.match_request(|req| !MikanMockServer::get_has_auth_matcher()(req))
.with_status(200)
.with_body_from_file("tests/resources/mikan/ExpandBangumi-3599-noauth.html")
.create();
let bangumi_subscribed_mock = mikan_server
.server
.mock("GET", MIKAN_BANGUMI_EXPAND_SUBSCRIBED_PAGE_PATH)
.match_query(mockito::Matcher::Any)
.match_request(|req| MikanMockServer::get_has_auth_matcher()(req))
.with_status(200)
.with_body_from_file("tests/resources/mikan/ExpandBangumi-3599.html")
.create();
let credential = mikan_client
.save_credential(
app_ctx.clone(),
1,
MikanCredentialForm {
username: String::from("test_username"),
password: String::from("test_password"),
user_agent: get_random_ua().to_string(),
},
)
.submit_credential_form(app_ctx.clone(), 1, build_testing_mikan_credential_form())
.await?;
let mikan_season_flow_url =
build_mikan_season_flow_url(mikan_base_url, 2025, MikanSeasonStr::Spring);
build_mikan_season_flow_url(mikan_base_url.clone(), 2025, MikanSeasonStr::Spring);
let bangumi_meta_list = scrape_mikan_bangumi_meta_list_from_season_flow_url(
mikan_client,
@@ -975,6 +1018,26 @@ mod test {
assert!(!bangumi_meta_list.is_empty());
let bangumi = bangumi_meta_list.first().unwrap();
assert!(
bangumi
.homepage
.to_string()
.ends_with("/Home/Bangumi/3288#370"),
);
assert_eq!(bangumi.bangumi_title, "吉伊卡哇");
assert_eq!(bangumi.mikan_bangumi_id, "3288");
assert!(
bangumi
.origin_poster_src
.as_ref()
.map_or(String::new(), |u| u.to_string())
.ends_with("/images/Bangumi/202204/d8ef46c0.jpg")
);
assert_eq!(bangumi.mikan_fansub_id, String::from("370"));
assert_eq!(bangumi.fansub, String::from("LoliHouse"));
Ok(())
}
@@ -987,8 +1050,9 @@ mod test {
let mikan_base_url = Url::parse(&mikan_server.url())?;
let mikan_client = build_testing_mikan_client(mikan_base_url.clone()).await?;
let episode_homepage_url =
mikan_base_url.join("/Home/Episode/475184dce83ea2b82902592a5ac3343f6d54b36a")?;
let episode_homepage_url = mikan_base_url
.clone()
.join("/Home/Episode/475184dce83ea2b82902592a5ac3343f6d54b36a")?;
let episode_homepage_mock = mikan_server
.mock("GET", episode_homepage_url.path())
@@ -1058,101 +1122,4 @@ mod test {
Ok(())
}
// #[rstest]
// #[tokio::test]
// async fn test_extract_mikan_bangumis_meta_from_my_bangumi_page(
// before_each: (),
// ) -> RecorderResult<()> {
// let mut mikan_server = mockito::Server::new_async().await;
// let mikan_base_url = Url::parse(&mikan_server.url())?;
// let my_bangumi_page_url = mikan_base_url.join("/Home/MyBangumi")?;
// let context = Arc::new(
// UnitTestAppContext::builder()
//
// .mikan(build_testing_mikan_client(mikan_base_url.clone()).await?)
// .build(),
// );
// {
// let my_bangumi_without_cookie_mock = mikan_server
// .mock("GET", my_bangumi_page_url.path())
// .match_header(header::COOKIE, mockito::Matcher::Missing)
//
// .with_body_from_file("tests/resources/mikan/MyBangumi-noauth.htm")
// .create_async()
// .await;
// let bangumi_metas =
// extract_mikan_bangumis_meta_from_my_bangumi_page(
// context.clone(), my_bangumi_page_url.clone(),
// None,
// &[],
// );
// pin_mut!(bangumi_metas);
// let bangumi_metas = bangumi_metas.try_collect::<Vec<_>>().await?;
// assert!(bangumi_metas.is_empty());
// assert!(my_bangumi_without_cookie_mock.matched_async().await);
// }
// {
// let my_bangumi_with_cookie_mock = mikan_server
// .mock("GET", my_bangumi_page_url.path())
// .match_header(
// header::COOKIE,
// mockito::Matcher::AllOf(vec![
//
// mockito::Matcher::Regex(String::from(".*\\.AspNetCore\\.Antiforgery.*")),
// mockito::Matcher::Regex(String::from(
// ".*\\.AspNetCore\\.Identity\\.Application.*",
// )),
// ]),
// )
// .with_body_from_file("tests/resources/mikan/MyBangumi.htm")
// .create_async()
// .await;
// let expand_bangumi_mock = mikan_server
// .mock("GET", "/ExpandBangumi")
// .match_query(mockito::Matcher::Any)
//
// .with_body_from_file("tests/resources/mikan/ExpandBangumi.htm")
// .create_async()
// .await;
// let auth_secrecy = Some(MikanCredentialForm {
// username: String::from("test_username"),
// password: String::from("test_password"),
// user_agent: String::from(
// "Mozilla/5.0 (Windows NT 10.0; Win64; x64)
// AppleWebKit/537.36 (KHTML, like \ Gecko)
// Chrome/133.0.0.0 Safari/537.36 Edg/133.0.0.0", ),
// });
// let bangumi_metas =
// extract_mikan_bangumis_meta_from_my_bangumi_page(
// context.clone(), my_bangumi_page_url,
// auth_secrecy,
// &[],
// );
// pin_mut!(bangumi_metas);
// let bangumi_metas = bangumi_metas.try_collect::<Vec<_>>().await?;
// assert!(!bangumi_metas.is_empty());
// assert!(bangumi_metas[0].origin_poster_src.is_some());
// assert!(my_bangumi_with_cookie_mock.matched_async().await);
// expand_bangumi_mock.expect(bangumi_metas.len());
// }
// Ok(())
// }
}

View File

@@ -1,10 +1,22 @@
use fetch::{FetchError, HttpClientConfig, IntoUrl};
use std::collections::HashMap;
use chrono::{Duration, Utc};
use fetch::{FetchError, HttpClientConfig, IntoUrl, get_random_ua};
use url::Url;
use crate::{
errors::RecorderResult,
extract::mikan::{MikanClient, MikanConfig},
extract::mikan::{
MIKAN_ACCOUNT_MANAGE_PAGE_PATH, MIKAN_LOGIN_PAGE_PATH, MikanClient, MikanConfig,
MikanCredentialForm,
},
};
// Fixed values shared between the testing credential form and the mock server's matchers.
// Username placed in the testing credential form and expected in the login POST body.
const TESTING_MIKAN_USERNAME: &str = "test_username";
// Password placed in the testing credential form and expected in the login POST body.
const TESTING_MIKAN_PASSWORD: &str = "test_password";
// Value of the antiforgery cookie set by the login GET mock; the login POST body must
// carry it as `__RequestVerificationToken`.
const TESTING_MIKAN_ANTIFORGERY: &str = "test_antiforgery";
// Value of the `.AspNetCore.Identity.Application` cookie issued on a successful login and
// looked for by the "has auth" request matcher.
const TESTING_MIKAN_IDENTITY: &str = "test_identity";
pub async fn build_testing_mikan_client(
base_mikan_url: impl IntoUrl,
) -> RecorderResult<MikanClient> {
@@ -17,3 +29,145 @@ pub async fn build_testing_mikan_client(
.await?;
Ok(mikan_client)
}
/// Builds the credential form used by tests: the fixed testing username and
/// password plus a user agent obtained from `get_random_ua`.
pub fn build_testing_mikan_credential_form() -> MikanCredentialForm {
    let username = TESTING_MIKAN_USERNAME.to_owned();
    let password = TESTING_MIKAN_PASSWORD.to_owned();
    let user_agent = get_random_ua().to_string();

    MikanCredentialForm {
        username,
        password,
        user_agent,
    }
}
/// Handles to the mocks registered by `MikanMockServer::mock_get_login_page`, kept so a
/// test can hold on to (and inspect) each one individually.
pub struct MikanMockServerLoginMock {
/// GET on the login page; responds with the testing antiforgery cookie.
pub login_get_mock: mockito::Mock,
/// POST on the login page whose form body carries the testing username, password and
/// verification token; responds with a redirect and the testing identity cookie.
pub login_post_success_mock: mockito::Mock,
/// POST on the login page with any other body; responds with the login error page.
pub login_post_failed_mock: mockito::Mock,
/// GET on the account manage page from a request carrying the testing identity cookie.
pub account_get_success_mock: mockito::Mock,
/// GET on the account manage page without that cookie; redirects to the login page.
pub account_get_failed_mock: mockito::Mock,
}
/// A `mockito` server standing in for the Mikan site in tests.
pub struct MikanMockServer {
/// The underlying mock server; public so tests can register additional mocks on it.
pub server: mockito::ServerGuard,
// URL parsed from `server.url()` at construction time; exposed through `base_url()`.
base_url: Url,
}
impl MikanMockServer {
pub async fn new() -> RecorderResult<Self> {
let server = mockito::Server::new_async().await;
let base_url = Url::parse(&server.url())?;
Ok(Self { server, base_url })
}
pub fn base_url(&self) -> &Url {
&self.base_url
}
pub fn get_has_auth_matcher() -> impl Fn(&mockito::Request) -> bool {
|req: &mockito::Request| -> bool {
let test_identity_cookie =
format!(".AspNetCore.Identity.Application={TESTING_MIKAN_IDENTITY}");
req.header("Cookie").iter().any(|cookie| {
cookie
.to_str()
.is_ok_and(|c| c.contains(&test_identity_cookie))
})
}
}
pub fn mock_get_login_page(&mut self) -> MikanMockServerLoginMock {
let login_get_mock = self
.server
.mock("GET", MIKAN_LOGIN_PAGE_PATH)
.match_query(mockito::Matcher::Any)
.with_status(201)
.with_header("Content-Type", "text/html; charset=utf-8")
.with_header(
"Set-Cookie",
&format!(
".AspNetCore.Antiforgery.test_app_id={TESTING_MIKAN_ANTIFORGERY}; HttpOnly; \
SameSite=Strict; Path=/"
),
)
.create();
let test_identity_expires = (Utc::now() + Duration::days(30)).to_rfc2822();
let match_post_login_body = |req: &mockito::Request| {
req.body()
.map(|b| url::form_urlencoded::parse(b))
.is_ok_and(|queires| {
let qs = queires.collect::<HashMap<_, _>>();
qs.get("UserName")
.is_some_and(|s| s == TESTING_MIKAN_USERNAME)
&& qs
.get("Password")
.is_some_and(|s| s == TESTING_MIKAN_PASSWORD)
&& qs
.get("__RequestVerificationToken")
.is_some_and(|s| s == TESTING_MIKAN_ANTIFORGERY)
})
};
let login_post_success_mock = {
let mikan_base_url = self.base_url().clone();
self.server
.mock("POST", MIKAN_LOGIN_PAGE_PATH)
.match_query(mockito::Matcher::Any)
.match_request(match_post_login_body)
.with_status(302)
.with_header(
"Set-Cookie",
&format!(
".AspNetCore.Identity.Application={TESTING_MIKAN_IDENTITY}; HttpOnly; \
SameSite=Lax; Path=/; Expires=${test_identity_expires}"
),
)
.with_header_from_request("Location", move |req| {
let request_url = mikan_base_url.join(req.path_and_query()).ok();
request_url
.and_then(|u| {
u.query_pairs()
.find(|(key, _)| key == "ReturnUrl")
.map(|(_, value)| value.to_string())
})
.unwrap_or(String::from("/"))
})
.create()
};
let login_post_failed_mock = self
.server
.mock("POST", MIKAN_LOGIN_PAGE_PATH)
.match_query(mockito::Matcher::Any)
.match_request(move |req| !match_post_login_body(req))
.with_status(200)
.with_body_from_file("tests/resources/mikan/LoginError.html")
.create();
let account_get_success_mock = self
.server
.mock("GET", MIKAN_ACCOUNT_MANAGE_PAGE_PATH)
.match_query(mockito::Matcher::Any)
.match_request(move |req| Self::get_has_auth_matcher()(req))
.with_status(200)
.create();
let account_get_failed_mock = self
.server
.mock("GET", MIKAN_ACCOUNT_MANAGE_PAGE_PATH)
.match_query(mockito::Matcher::Any)
.match_request(move |req| !Self::get_has_auth_matcher()(req))
.with_status(302)
.with_header("Location", MIKAN_LOGIN_PAGE_PATH)
.create();
MikanMockServerLoginMock {
login_get_mock,
login_post_success_mock,
login_post_failed_mock,
account_get_success_mock,
account_get_failed_mock,
}
}
}