feature: add new mikan scrapers

2025-05-03 04:23:33 +08:00
parent dbded94324
commit 3fe0538468
36 changed files with 1001 additions and 793 deletions

View File

@@ -108,7 +108,7 @@ pub fn parse_episode_media_meta_from_torrent(
     let media_name = torrent_path
         .file_name()
         .with_whatever_context::<_, _, RecorderError>(|| {
-            format!("failed to get file name of {}", torrent_path)
+            format!("failed to get file name of {torrent_path}")
         })?;
     let mut match_obj = None;
     for rule in TORRENT_EP_PARSE_RULES.iter() {
@@ -141,7 +141,7 @@ pub fn parse_episode_media_meta_from_torrent(
         .unwrap_or(1);
     let extname = torrent_path
         .extension()
-        .map(|e| format!(".{}", e))
+        .map(|e| format!(".{e}"))
         .unwrap_or_default();
     Ok(TorrentEpisodeMediaMeta {
         fansub: fansub.map(|s| s.to_string()),
@@ -168,7 +168,7 @@ pub fn parse_episode_subtitle_meta_from_torrent(
     let media_name = torrent_path
         .file_name()
         .with_whatever_context::<_, _, RecorderError>(|| {
-            format!("failed to get file name of {}", torrent_path)
+            format!("failed to get file name of {torrent_path}")
         })?;

     let lang = get_subtitle_lang(media_name);
@@ -271,7 +271,7 @@ mod tests {
     pub fn test_torrent_ep_parser(raw_name: &str, expected: &str) {
         let extname = Path::new(raw_name)
             .extension()
-            .map(|e| format!(".{}", e))
+            .map(|e| format!(".{e}"))
             .unwrap_or_default()
             .to_lowercase();
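
The change across this file is mechanical: positional arguments such as format!("{}", x) become the inline captures stabilized in Rust 1.58, format!("{x}"). A minimal standalone sketch (values hypothetical) showing the two forms are equivalent; note that only plain identifiers can be captured, so an expression like path.url would still need a positional argument:

fn main() {
    let torrent_path = "example-s01e01.mkv";
    // Positional argument vs. inline capture: identical output.
    let before = format!("failed to get file name of {}", torrent_path);
    let after = format!("failed to get file name of {torrent_path}");
    assert_eq!(before, after);
}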

View File

@@ -19,21 +19,19 @@ pub fn extract_background_image_src_from_style_attr(
         match prop {
             Property::BackgroundImage(images) => {
                 for img in images {
-                    if let CSSImage::Url(path) = img {
-                        if let Some(url) = extract_image_src_from_str(path.url.trim(), base_url)
-                        {
-                            return Some(url);
-                        }
+                    if let CSSImage::Url(path) = img
+                        && let Some(url) = extract_image_src_from_str(path.url.trim(), base_url)
+                    {
+                        return Some(url);
                     }
                 }
             }
             Property::Background(backgrounds) => {
                 for bg in backgrounds {
-                    if let CSSImage::Url(path) = &bg.image {
-                        if let Some(url) = extract_image_src_from_str(path.url.trim(), base_url)
-                        {
-                            return Some(url);
-                        }
+                    if let CSSImage::Url(path) = &bg.image
+                        && let Some(url) = extract_image_src_from_str(path.url.trim(), base_url)
+                    {
+                        return Some(url);
                     }
                 }
             }
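
This hunk collapses nested if let blocks into a let-chain (if let … && let …), which removes one indentation level per condition; the syntax is tied to the Rust 2024 edition. A minimal sketch of the same shape, with a hypothetical map lookup standing in for the CSS types:

use std::collections::HashMap;

fn lookup(map: &HashMap<String, String>, key: Option<&str>) -> Option<String> {
    // Both bindings sit in one `if`; the body runs only when every pattern
    // matches, replacing two nested `if let` blocks.
    if let Some(k) = key
        && let Some(v) = map.get(k)
    {
        return Some(v.clone());
    }
    None
}

fn main() {
    let map = HashMap::from([("a".to_string(), "1".to_string())]);
    assert_eq!(lookup(&map, Some("a")), Some("1".to_string()));
    assert_eq!(lookup(&map, None), None);
}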

View File

@@ -1,4 +1,4 @@
-use axum::http::{header, request::Parts, HeaderName, HeaderValue, Uri};
+use axum::http::{HeaderName, HeaderValue, Uri, header, request::Parts};
 use itertools::Itertools;
 use url::Url;

@@ -121,11 +121,7 @@ impl ForwardedRelatedInfo {
            .and_then(|s| s.to_str().ok())
            .and_then(|s| {
                let l = s.split(",").map(|s| s.trim().to_string()).collect_vec();
-               if l.is_empty() {
-                   None
-               } else {
-                   Some(l)
-               }
+               if l.is_empty() { None } else { Some(l) }
            });

        let host = headers
@@ -165,7 +161,7 @@
     pub fn resolved_origin(&self) -> Option<Url> {
         if let (Some(protocol), Some(host)) = (self.resolved_protocol(), self.resolved_host()) {
-            let origin = format!("{}://{}", protocol, host);
+            let origin = format!("{protocol}://{host}");
             Url::parse(&origin).ok()
         } else {
             None
         }
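
Two of the hunks in this file look like formatter output rather than hand edits: the use axum::http::{…} list is re-sorted (consistent with rustfmt's 2024 style edition, which orders uppercase type names before lowercase module names), and the short if/else is collapsed onto one line. The surviving logic is a small Option-producing chain; a sketch of the same idea with a hypothetical forwarded-for header value, assuming the itertools crate for collect_vec:

use itertools::Itertools;

fn main() {
    // Split a comma-separated forwarded-for value; an empty list maps to None.
    let s = "203.0.113.7, 198.51.100.2";
    let l = s.split(",").map(|s| s.trim().to_string()).collect_vec();
    let forwarded_for = if l.is_empty() { None } else { Some(l) };
    assert_eq!(
        forwarded_for,
        Some(vec!["203.0.113.7".to_string(), "198.51.100.2".to_string()])
    );
}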

View File

@@ -3,7 +3,7 @@ use url::Url;
 pub fn extract_image_src_from_str(image_src: &str, base_url: &Url) -> Option<Url> {
     let mut image_url = base_url.join(image_src).ok()?;
     if let Some((_, value)) = image_url.query_pairs().find(|(key, _)| key == "webp") {
-        image_url.set_query(Some(&format!("webp={}", value)));
+        image_url.set_query(Some(&format!("webp={value}")));
     } else {
         image_url.set_query(None);
     }
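
The function normalizes scraped image URLs: resolve the src against the page's base URL, then drop every query parameter except webp. A runnable sketch of that behavior using the url crate; the poster path and parameters are hypothetical:

use url::Url;

fn main() {
    let base_url = Url::parse("https://mikanani.me/").unwrap();
    let src = "/images/Bangumi/202410/poster.jpg?width=400&height=560&webp=1";

    let mut image_url = base_url.join(src).unwrap();
    if let Some((_, value)) = image_url.query_pairs().find(|(key, _)| key == "webp") {
        // Keep only the webp switch so the URL stays a stable cache key.
        image_url.set_query(Some(&format!("webp={value}")));
    } else {
        image_url.set_query(None);
    }
    assert_eq!(image_url.query(), Some("webp=1"));
}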

View File

@@ -3,7 +3,6 @@ use std::{fmt::Debug, ops::Deref, sync::Arc};
 use fetch::{HttpClient, HttpClientTrait};
 use maplit::hashmap;
 use sea_orm::DbErr;
-use secrecy::SecretBox;
 use serde::{Deserialize, Serialize};
 use url::Url;
 use util::OptDynErr;
@@ -23,8 +22,6 @@ pub struct MikanCredentialForm {
     pub user_agent: String,
 }

-pub type MikanAuthSecrecy = SecretBox<MikanCredentialForm>;
-
 impl Debug for MikanCredentialForm {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
         f.debug_struct("MikanCredentialForm")
@@ -72,7 +69,7 @@ impl MikanClient {
             Ok(false)
         } else {
             Err(RecorderError::Credential3rdError {
-                message: format!("mikan account check has login failed, status = {}", status),
+                message: format!("mikan account check has login failed, status = {status}"),
                 source: None.into(),
             })
         }
@@ -189,7 +186,7 @@
                 userpass_credential_opt = Some(userpass_credential);
             } else {
                 return Err(RecorderError::from_db_record_not_found(
-                    DbErr::RecordNotFound(format!("credential={} not found", credential_id)),
+                    DbErr::RecordNotFound(format!("credential={credential_id} not found")),
                 ));
             }
         }
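
Alongside the format-string cleanups, this file drops the secrecy::SecretBox import and the MikanAuthSecrecy alias, while MikanCredentialForm keeps its hand-written Debug impl. The diff only shows the first line of that impl's body, so the following redaction sketch is an assumption about its intent, with hypothetical field names, not the project's actual code:

use std::fmt::Debug;

pub struct MikanCredentialForm {
    pub username: String,   // hypothetical field
    pub password: String,   // hypothetical field
    pub user_agent: String,
}

impl Debug for MikanCredentialForm {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Redact secrets so credentials never leak into logs via {:?}.
        f.debug_struct("MikanCredentialForm")
            .field("username", &"[redacted]")
            .field("password", &"[redacted]")
            .field("user_agent", &self.user_agent)
            .finish()
    }
}

fn main() {
    let form = MikanCredentialForm {
        username: "alice".into(),
        password: "hunter2".into(),
        user_agent: "recorder/0.1".into(),
    };
    println!("{form:?}"); // secrets print as "[redacted]"
}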

View File

@@ -1,4 +1,4 @@
-pub const MIKAN_BUCKET_KEY: &str = "mikan";
+pub const MIKAN_POSTER_BUCKET_KEY: &str = "mikan_poster";
 pub const MIKAN_UNKNOWN_FANSUB_NAME: &str = "生肉/不明字幕";
 pub const MIKAN_UNKNOWN_FANSUB_ID: &str = "202";
 pub const MIKAN_LOGIN_PAGE_PATH: &str = "/Account/Login";

View File

@@ -1,23 +1,31 @@
-pub mod client;
-pub mod config;
-pub mod constants;
-pub mod rss_extract;
-pub mod web_extract;
+mod client;
+mod config;
+mod constants;
+mod rss;
+mod web;

 pub use client::{MikanClient, MikanCredentialForm};
 pub use config::MikanConfig;
-pub use constants::MIKAN_BUCKET_KEY;
-pub use rss_extract::{
-    MikanBangumiAggregationRssChannel, MikanBangumiRssChannel, MikanBangumiRssUrlMeta,
-    MikanRssChannel, MikanRssItem, MikanSubscriberAggregationRssChannel,
-    MikanSubscriberAggregationRssUrlMeta, build_mikan_bangumi_rss_url,
-    build_mikan_subscriber_aggregation_rss_url, extract_mikan_bangumi_id_from_rss_url,
-    extract_mikan_rss_channel_from_rss_link, extract_mikan_subscriber_aggregation_id_from_rss_link,
+pub use constants::{
+    MIKAN_ACCOUNT_MANAGE_PAGE_PATH, MIKAN_LOGIN_PAGE_PATH, MIKAN_LOGIN_PAGE_SEARCH,
+    MIKAN_POSTER_BUCKET_KEY, MIKAN_UNKNOWN_FANSUB_ID, MIKAN_UNKNOWN_FANSUB_NAME,
 };
-pub use web_extract::{
-    MikanBangumiMeta, MikanEpisodeMeta, MikanSeasonStr, build_mikan_bangumi_homepage_url,
-    build_mikan_episode_homepage_url, build_mikan_season_flow_url,
-    extract_mikan_bangumi_indices_meta_from_season_flow_fragment,
-    extract_mikan_bangumi_meta_from_bangumi_homepage,
-    extract_mikan_episode_meta_from_episode_homepage,
+pub use rss::{
+    MikanBangumiIndexRssChannel, MikanBangumiRssChannel, MikanBangumiRssUrlMeta, MikanRssChannel,
+    MikanRssItem, MikanSubscriberAggregationRssUrlMeta, MikanSubscriberStreamRssChannel,
+    build_mikan_bangumi_rss_url, build_mikan_subscriber_aggregation_rss_url,
+    extract_mikan_bangumi_id_from_rss_url, extract_mikan_rss_channel_from_rss_link,
+    extract_mikan_subscriber_aggregation_id_from_rss_link,
 };
+pub use web::{
+    MikanBangumiHomepageUrlMeta, MikanBangumiIndexHomepageUrlMeta, MikanBangumiIndexMeta,
+    MikanBangumiMeta, MikanBangumiPosterMeta, MikanEpisodeHomepageUrlMeta, MikanEpisodeMeta,
+    MikanSeasonFlowUrlMeta, MikanSeasonStr, build_mikan_bangumi_expand_subscribed_url,
+    build_mikan_bangumi_homepage_url, build_mikan_episode_homepage_url,
+    build_mikan_season_flow_url, extract_mikan_bangumi_index_meta_list_from_season_flow_fragment,
+    extract_mikan_episode_meta_from_episode_homepage_html,
+    scrape_mikan_bangumi_meta_from_bangumi_homepage_url,
+    scrape_mikan_bangumi_meta_list_from_season_flow_url,
+    scrape_mikan_episode_meta_from_episode_homepage_url, scrape_mikan_poster_data_from_image_url,
+    scrape_mikan_poster_meta_from_image_url,
+};
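
The module layout change is the main API move in the commit: rss_extract and web_extract become private rss and web, and everything callers may touch is re-exported from the parent, so downstream code imports extract::mikan::… without knowing the file layout. A minimal sketch of this facade pattern, with hypothetical simplified types:

mod mikan {
    // Private submodules: renaming or splitting them later is not a breaking change.
    mod rss {
        pub struct MikanRssItem {
            pub title: String,
        }
    }
    mod web {
        pub struct MikanEpisodeMeta {
            pub episode_title: String,
        }
    }

    pub use rss::MikanRssItem;
    pub use web::MikanEpisodeMeta;
}

fn main() {
    // Callers name the facade, never the submodule.
    let item = mikan::MikanRssItem { title: "example".into() };
    let meta = mikan::MikanEpisodeMeta { episode_title: item.title };
    println!("{}", meta.episode_title);
    // `mikan::rss::MikanRssItem` would fail to compile: `rss` is private.
}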

View File

@@ -10,10 +10,7 @@ use url::Url;
 use crate::{
     errors::app_error::{RecorderError, RecorderResult},
-    extract::mikan::{
-        MikanClient,
-        web_extract::{MikanEpisodeHomepage, extract_mikan_episode_id_from_homepage_url},
-    },
+    extract::mikan::{MikanClient, MikanEpisodeHomepageUrlMeta},
 };

 #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
@@ -37,7 +34,7 @@ pub struct MikanBangumiRssChannel {
 }

 #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
-pub struct MikanBangumiAggregationRssChannel {
+pub struct MikanBangumiIndexRssChannel {
     pub name: String,
     pub url: Url,
     pub mikan_bangumi_id: String,
@@ -45,7 +42,7 @@ pub struct MikanBangumiAggregationRssChannel {
 }

 #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
-pub struct MikanSubscriberAggregationRssChannel {
+pub struct MikanSubscriberStreamRssChannel {
     pub mikan_aggregation_id: String,
     pub url: Url,
     pub items: Vec<MikanRssItem>,
@@ -54,46 +51,40 @@ pub struct MikanSubscriberAggregationRssChannel {
 #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
 pub enum MikanRssChannel {
     Bangumi(MikanBangumiRssChannel),
-    BangumiAggregation(MikanBangumiAggregationRssChannel),
-    SubscriberAggregation(MikanSubscriberAggregationRssChannel),
+    BangumiIndex(MikanBangumiIndexRssChannel),
+    SubscriberStream(MikanSubscriberStreamRssChannel),
 }

 impl MikanRssChannel {
     pub fn items(&self) -> &[MikanRssItem] {
         match &self {
             Self::Bangumi(MikanBangumiRssChannel { items, .. })
-            | Self::BangumiAggregation(MikanBangumiAggregationRssChannel { items, .. })
-            | Self::SubscriberAggregation(MikanSubscriberAggregationRssChannel { items, .. }) => {
-                items
-            }
+            | Self::BangumiIndex(MikanBangumiIndexRssChannel { items, .. })
+            | Self::SubscriberStream(MikanSubscriberStreamRssChannel { items, .. }) => items,
         }
     }

     pub fn into_items(self) -> Vec<MikanRssItem> {
         match self {
             Self::Bangumi(MikanBangumiRssChannel { items, .. })
-            | Self::BangumiAggregation(MikanBangumiAggregationRssChannel { items, .. })
-            | Self::SubscriberAggregation(MikanSubscriberAggregationRssChannel { items, .. }) => {
-                items
-            }
+            | Self::BangumiIndex(MikanBangumiIndexRssChannel { items, .. })
+            | Self::SubscriberStream(MikanSubscriberStreamRssChannel { items, .. }) => items,
         }
     }

     pub fn name(&self) -> Option<&str> {
         match &self {
             Self::Bangumi(MikanBangumiRssChannel { name, .. })
-            | Self::BangumiAggregation(MikanBangumiAggregationRssChannel { name, .. }) => {
-                Some(name.as_str())
-            }
-            Self::SubscriberAggregation(MikanSubscriberAggregationRssChannel { .. }) => None,
+            | Self::BangumiIndex(MikanBangumiIndexRssChannel { name, .. }) => Some(name.as_str()),
+            Self::SubscriberStream(MikanSubscriberStreamRssChannel { .. }) => None,
         }
     }

     pub fn url(&self) -> &Url {
         match &self {
             Self::Bangumi(MikanBangumiRssChannel { url, .. })
-            | Self::BangumiAggregation(MikanBangumiAggregationRssChannel { url, .. })
-            | Self::SubscriberAggregation(MikanSubscriberAggregationRssChannel { url, .. }) => url,
+            | Self::BangumiIndex(MikanBangumiIndexRssChannel { url, .. })
+            | Self::SubscriberStream(MikanSubscriberStreamRssChannel { url, .. }) => url,
         }
     }
 }
@@ -133,9 +124,9 @@ impl TryFrom<rss::Item> for MikanRssItem {
             RecorderError::from_mikan_rss_invalid_field(Cow::Borrowed("homepage:link"))
         })?;

-        let MikanEpisodeHomepage {
+        let MikanEpisodeHomepageUrlMeta {
             mikan_episode_id, ..
-        } = extract_mikan_episode_id_from_homepage_url(&homepage).ok_or_else(|| {
+        } = MikanEpisodeHomepageUrlMeta::parse_url(&homepage).ok_or_else(|| {
             RecorderError::from_mikan_rss_invalid_field(Cow::Borrowed("mikan_episode_id"))
         })?;

@@ -278,17 +269,15 @@ pub async fn extract_mikan_rss_channel_from_rss_link(
                 channel_name,
                 channel_link = channel_link.as_str(),
                 mikan_bangumi_id,
-                "MikanBangumiAggregationRssChannel extracted"
+                "MikanBangumiIndexRssChannel extracted"
             );

-            Ok(MikanRssChannel::BangumiAggregation(
-                MikanBangumiAggregationRssChannel {
-                    name: channel_name,
-                    mikan_bangumi_id,
-                    url: channel_link,
-                    items,
-                },
-            ))
+            Ok(MikanRssChannel::BangumiIndex(MikanBangumiIndexRssChannel {
+                name: channel_name,
+                mikan_bangumi_id,
+                url: channel_link,
+                items,
+            }))
         }
     } else if let Some(MikanSubscriberAggregationRssUrlMeta {
         mikan_aggregation_id,
@@ -317,8 +306,8 @@
             "MikanSubscriberAggregationRssChannel extracted"
         );

-        Ok(MikanRssChannel::SubscriberAggregation(
-            MikanSubscriberAggregationRssChannel {
+        Ok(MikanRssChannel::SubscriberStream(
+            MikanSubscriberStreamRssChannel {
                 mikan_aggregation_id,
                 items,
                 url: channel_link,
@@ -342,7 +331,7 @@ mod tests {
     use crate::{
         errors::RecorderResult,
         extract::mikan::{
-            MikanBangumiAggregationRssChannel, MikanBangumiRssChannel, MikanRssChannel,
+            MikanBangumiIndexRssChannel, MikanBangumiRssChannel, MikanRssChannel,
             extract_mikan_rss_channel_from_rss_link,
         },
         test_utils::mikan::build_testing_mikan_client,
@@ -413,7 +402,7 @@
         assert_matches!(
             &channel,
-            MikanRssChannel::BangumiAggregation(MikanBangumiAggregationRssChannel { .. })
+            MikanRssChannel::BangumiIndex(MikanBangumiIndexRssChannel { .. })
         );

         assert_matches!(&channel.name(), Some("叹气的亡灵想隐退"));
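
The renames (BangumiAggregation to BangumiIndex, SubscriberAggregation to SubscriberStream) ripple through an accessor pattern worth noting: one match arm destructures several enum variants at once with an or-pattern, which is legal whenever every alternative binds the same names at the same types. A compact sketch with simplified stand-in types:

struct Bangumi {
    items: Vec<u32>,
}
struct BangumiIndex {
    items: Vec<u32>,
}

enum Channel {
    Bangumi(Bangumi),
    BangumiIndex(BangumiIndex),
}

impl Channel {
    fn items(&self) -> &[u32] {
        // Both alternatives bind `items`, so one arm covers both variants.
        match self {
            Channel::Bangumi(Bangumi { items })
            | Channel::BangumiIndex(BangumiIndex { items }) => items,
        }
    }
}

fn main() {
    let c = Channel::BangumiIndex(BangumiIndex { items: vec![1, 2, 3] });
    assert_eq!(c.items(), &[1, 2, 3]);
}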

View File

@@ -101,19 +101,19 @@ fn title_body_pre_process(title_body: &str, fansub: Option<&str>) -> RecorderRes
             raw = sub.replace_all(&raw, "").to_string();
         }
     }
-    if let Some(m) = MAIN_TITLE_PRE_PROCESS_BACKETS_RE.find(&raw) {
-        if m.len() as f32 > (raw.len() as f32) * 0.5 {
-            let mut raw1 = MAIN_TITLE_PRE_PROCESS_BACKETS_RE_SUB1
-                .replace(&raw, "")
-                .chars()
-                .collect_vec();
-            while let Some(ch) = raw1.pop() {
-                if ch == ']' {
-                    break;
-                }
-            }
-            raw = raw1.into_iter().collect();
-        }
+    if let Some(m) = MAIN_TITLE_PRE_PROCESS_BACKETS_RE.find(&raw)
+        && m.len() as f32 > (raw.len() as f32) * 0.5
+    {
+        let mut raw1 = MAIN_TITLE_PRE_PROCESS_BACKETS_RE_SUB1
+            .replace(&raw, "")
+            .chars()
+            .collect_vec();
+        while let Some(ch) = raw1.pop() {
+            if ch == ']' {
+                break;
+            }
+        }
+        raw = raw1.into_iter().collect();
     }
     Ok(raw.to_string())
 }
@@ -136,23 +136,21 @@ pub fn extract_season_from_title_body(title_body: &str) -> (String, Option<Strin
     for s in seasons {
         season_raw = Some(s);
-        if let Some(m) = SEASON_EXTRACT_SEASON_EN_PREFIX_RE.find(s) {
-            if let Ok(s) = SEASON_EXTRACT_SEASON_ALL_RE
+        if let Some(m) = SEASON_EXTRACT_SEASON_EN_PREFIX_RE.find(s)
+            && let Ok(s) = SEASON_EXTRACT_SEASON_ALL_RE
                 .replace_all(m.as_str(), "")
                 .parse::<i32>()
-            {
-                season = s;
-                break;
-            }
+        {
+            season = s;
+            break;
         }
-        if let Some(m) = SEASON_EXTRACT_SEASON_EN_NTH_RE.find(s) {
-            if let Some(s) = DIGIT_1PLUS_REG
+        if let Some(m) = SEASON_EXTRACT_SEASON_EN_NTH_RE.find(s)
+            && let Some(s) = DIGIT_1PLUS_REG
                 .find(m.as_str())
                 .and_then(|s| s.as_str().parse::<i32>().ok())
-            {
-                season = s;
-                break;
-            }
+        {
+            season = s;
+            break;
        }
         if let Some(m) = SEASON_EXTRACT_SEASON_ZH_PREFIX_RE.find(s) {
             if let Ok(s) = SEASON_EXTRACT_SEASON_ZH_PREFIX_SUB_RE