refactor: rewrite origin name extractor from regex to nom combinators

This commit is contained in:
master 2025-06-19 02:37:56 +08:00
parent c12b9b360a
commit 324427513c
10 changed files with 2241 additions and 900 deletions

418
Cargo.lock generated
View File

@ -933,6 +933,16 @@ dependencies = [
"walkdir",
]
[[package]]
name = "calendrical_calculations"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9f6df87e869fb08be61c7e97ced8e69ab802df1d8bc612ed67dba78c07fbc12c"
dependencies = [
"core_maths",
"displaydoc",
]
[[package]]
name = "cc"
version = "1.2.26"
@ -1280,6 +1290,15 @@ version = "0.8.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b"
[[package]]
name = "core_maths"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "77745e017f5edba1a9c1d854f6f3a52dac8a12dd5af5d2f54aecf61e43d80d30"
dependencies = [
"libm",
]
[[package]]
name = "cpufeatures"
version = "0.2.17"
@ -2082,6 +2101,17 @@ dependencies = [
"windows-sys 0.59.0",
]
[[package]]
name = "fixed_decimal"
version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "35943d22b2f19c0cb198ecf915910a8158e94541c89dcc63300d7799d46c2c5e"
dependencies = [
"displaydoc",
"smallvec",
"writeable",
]
[[package]]
name = "flate2"
version = "1.1.2"
@ -2901,6 +2931,103 @@ dependencies = [
"cc",
]
[[package]]
name = "icu"
version = "2.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ab13fe39da5da564b88228e9f08815c9d0efbe9ec244e72b149d9994e10f1054"
dependencies = [
"icu_calendar",
"icu_casemap",
"icu_collator",
"icu_collections",
"icu_datetime",
"icu_decimal",
"icu_experimental",
"icu_list",
"icu_locale",
"icu_normalizer",
"icu_pattern",
"icu_plurals",
"icu_properties",
"icu_provider",
"icu_segmenter",
"icu_time",
]
[[package]]
name = "icu_calendar"
version = "2.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b7a6ed1ea995a24dff839bc5ca4471ce2fa18ba14d8b09061c2527a46a1c6079"
dependencies = [
"calendrical_calculations",
"displaydoc",
"icu_calendar_data",
"icu_locale",
"icu_locale_core",
"icu_provider",
"ixdtf",
"tinystr",
"writeable",
"zerovec",
]
[[package]]
name = "icu_calendar_data"
version = "2.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7219c8639ab936713a87b571eed2bc2615aa9137e8af6eb221446ee5644acc18"
[[package]]
name = "icu_casemap"
version = "2.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6dc5e74b3c9d7b63e0d7c5fd54ee8c135705df2ea2aa558082dd555dc9747a97"
dependencies = [
"displaydoc",
"icu_casemap_data",
"icu_collections",
"icu_locale_core",
"icu_properties",
"icu_provider",
"potential_utf",
"writeable",
"zerovec",
]
[[package]]
name = "icu_casemap_data"
version = "2.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f7584067558ab4c60c95d1ac2abd1588689cb4bcd4e099507f62dae86ae8d2c0"
[[package]]
name = "icu_collator"
version = "2.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "42ad4c6a556938dfd31f75a8c54141079e8821dc697ffb799cfe0f0fa11f2edc"
dependencies = [
"displaydoc",
"icu_collator_data",
"icu_collections",
"icu_locale",
"icu_locale_core",
"icu_normalizer",
"icu_properties",
"icu_provider",
"smallvec",
"utf16_iter",
"utf8_iter",
"zerovec",
]
[[package]]
name = "icu_collator_data"
version = "2.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d880b8e680799eabd90c054e1b95526cd48db16c95269f3c89fb3117e1ac92c5"
[[package]]
name = "icu_collections"
version = "2.0.0"
@ -2914,6 +3041,139 @@ dependencies = [
"zerovec",
]
[[package]]
name = "icu_datetime"
version = "2.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0790c15e3d6ae3303365fa2337b4f6469de257916141110d14dcaf73f1d31ac5"
dependencies = [
"displaydoc",
"either",
"fixed_decimal",
"icu_calendar",
"icu_datetime_data",
"icu_decimal",
"icu_locale",
"icu_locale_core",
"icu_pattern",
"icu_plurals",
"icu_provider",
"icu_time",
"potential_utf",
"smallvec",
"tinystr",
"writeable",
"zerovec",
]
[[package]]
name = "icu_datetime_data"
version = "2.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "83791ac10bb7b774f130bb81fa89c4059de710dcef53caa0b86e645212d6d54c"
[[package]]
name = "icu_decimal"
version = "2.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fec61c43fdc4e368a9f450272833123a8ef0d7083a44597660ce94d791b8a2e2"
dependencies = [
"displaydoc",
"fixed_decimal",
"icu_decimal_data",
"icu_locale",
"icu_locale_core",
"icu_provider",
"serde",
"tinystr",
"writeable",
"zerovec",
]
[[package]]
name = "icu_decimal_data"
version = "2.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b70963bc35f9bdf1bc66a5c1f458f4991c1dc71760e00fa06016b2c76b2738d5"
[[package]]
name = "icu_experimental"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ebe3d7e64892a434b08d5a58b53127e47a095ff780305f563c8c01798a1051b0"
dependencies = [
"displaydoc",
"either",
"fixed_decimal",
"icu_casemap",
"icu_collections",
"icu_decimal",
"icu_experimental_data",
"icu_list",
"icu_locale",
"icu_locale_core",
"icu_normalizer",
"icu_pattern",
"icu_plurals",
"icu_properties",
"icu_provider",
"litemap",
"num-bigint",
"num-rational 0.4.2",
"num-traits",
"potential_utf",
"smallvec",
"tinystr",
"writeable",
"zerofrom",
"zerotrie",
"zerovec",
]
[[package]]
name = "icu_experimental_data"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b60d32ba5610adfc2083f5a759f55d9a9082ebf72750f126cb1630844eea1acf"
[[package]]
name = "icu_list"
version = "2.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e26f94ec776bb8b28cedc7dcf91033b822c5cb4c1783cf7a3f796fc168aa0c8b"
dependencies = [
"displaydoc",
"icu_list_data",
"icu_locale",
"icu_provider",
"regex-automata 0.4.9",
"serde",
"writeable",
"zerovec",
]
[[package]]
name = "icu_list_data"
version = "2.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5a456a2412458ca45e181d9d51c5090ef8cd90f5692e11d34bafab3b3be1c76b"
[[package]]
name = "icu_locale"
version = "2.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6ae5921528335e91da1b6c695dbf1ec37df5ac13faa3f91e5640be93aa2fbefd"
dependencies = [
"displaydoc",
"icu_collections",
"icu_locale_core",
"icu_locale_data",
"icu_provider",
"potential_utf",
"tinystr",
"zerovec",
]
[[package]]
name = "icu_locale_core"
version = "2.0.0"
@ -2927,6 +3187,12 @@ dependencies = [
"zerovec",
]
[[package]]
name = "icu_locale_data"
version = "2.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4fdef0c124749d06a743c69e938350816554eb63ac979166590e2b4ee4252765"
[[package]]
name = "icu_normalizer"
version = "2.0.0"
@ -2939,6 +3205,9 @@ dependencies = [
"icu_properties",
"icu_provider",
"smallvec",
"utf16_iter",
"utf8_iter",
"write16",
"zerovec",
]
@ -2948,6 +3217,39 @@ version = "2.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "00210d6893afc98edb752b664b8890f0ef174c8adbb8d0be9710fa66fbbf72d3"
[[package]]
name = "icu_pattern"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "983825f401e6bc4a13c45d552ffd9ad6f3f6b6bc0ec03f31d6835a90a46deb1f"
dependencies = [
"displaydoc",
"either",
"writeable",
"yoke",
"zerovec",
]
[[package]]
name = "icu_plurals"
version = "2.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0fd83a65f58b6f28e1f3da8c6ada6b415ee3ad5cb480b75bdb669f34d72dd179"
dependencies = [
"displaydoc",
"fixed_decimal",
"icu_locale",
"icu_plurals_data",
"icu_provider",
"zerovec",
]
[[package]]
name = "icu_plurals_data"
version = "2.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9ec552d761eaf4a1c39ad28936e0af77a41bf01ff756ea54be4f8bfc21c265d7"
[[package]]
name = "icu_properties"
version = "2.0.1"
@ -2987,6 +3289,56 @@ dependencies = [
"zerovec",
]
[[package]]
name = "icu_segmenter"
version = "2.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e185fc13b6401c138cf40db12b863b35f5edf31b88192a545857b41aeaf7d3d3"
dependencies = [
"core_maths",
"displaydoc",
"icu_collections",
"icu_locale",
"icu_locale_core",
"icu_provider",
"icu_segmenter_data",
"potential_utf",
"utf8_iter",
"zerovec",
]
[[package]]
name = "icu_segmenter_data"
version = "2.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5360a2fbe97f617c4f8b944356dedb36d423f7da7f13c070995cf89e59f01220"
[[package]]
name = "icu_time"
version = "2.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "10d01a4a2dcbc5e5180ef113920e7461d0e9caaddb3567d81c4eca262efe55c0"
dependencies = [
"calendrical_calculations",
"displaydoc",
"icu_calendar",
"icu_locale_core",
"icu_provider",
"icu_time_data",
"ixdtf",
"serde",
"tinystr",
"writeable",
"zerotrie",
"zerovec",
]
[[package]]
name = "icu_time_data"
version = "2.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8472be4410d26a03d7208cae3a76c798dd6766e8226ab977cd8b2d349a6dbf08"
[[package]]
name = "ident_case"
version = "1.0.1"
@ -3219,6 +3571,15 @@ version = "1.0.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c"
[[package]]
name = "ixdtf"
version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8289f7f711a1a51f80e2e368355d023042ca55d8d554fd5e953f01464c15842d"
dependencies = [
"displaydoc",
]
[[package]]
name = "jobserver"
version = "0.1.33"
@ -4154,6 +4515,15 @@ dependencies = [
"winapi",
]
[[package]]
name = "nu-ansi-term"
version = "0.50.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d4a28e057d01f97e61255210fcff094d74ed0466038633e95017f5beb68e4399"
dependencies = [
"windows-sys 0.52.0",
]
[[package]]
name = "num"
version = "0.2.1"
@ -4163,7 +4533,7 @@ dependencies = [
"num-complex",
"num-integer",
"num-iter",
"num-rational",
"num-rational 0.2.4",
"num-traits",
]
@ -4241,6 +4611,17 @@ dependencies = [
"num-traits",
]
[[package]]
name = "num-rational"
version = "0.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f83d14da390562dca69fc84082e73e548e1ad308d24accdedd2720017cb37824"
dependencies = [
"num-bigint",
"num-integer",
"num-traits",
]
[[package]]
name = "num-traits"
version = "0.2.19"
@ -4869,6 +5250,7 @@ version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e5a7c30837279ca13e7c867e9e40053bc68740f988cb07f7ca6df43cc734b585"
dependencies = [
"serde",
"zerovec",
]
@ -5252,6 +5634,8 @@ dependencies = [
"futures",
"html-escape",
"http",
"icu",
"icu_properties",
"inquire",
"insta",
"ipnetwork",
@ -5265,6 +5649,8 @@ dependencies = [
"mockito",
"moka",
"nanoid",
"nom",
"num-traits",
"once_cell",
"opendal",
"openidconnect",
@ -5296,6 +5682,7 @@ dependencies = [
"tracing",
"tracing-appender",
"tracing-subscriber",
"tracing-tree",
"typed-builder 0.21.0",
"url",
"util",
@ -7456,7 +7843,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e8189decb5ac0fa7bc8b96b7cb9b2701d60d48805aca84a238004d665fcc4008"
dependencies = [
"matchers",
"nu-ansi-term",
"nu-ansi-term 0.46.0",
"once_cell",
"regex",
"serde",
@ -7470,6 +7857,18 @@ dependencies = [
"tracing-serde",
]
[[package]]
name = "tracing-tree"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f459ca79f1b0d5f71c54ddfde6debfc59c8b6eeb46808ae492077f739dc7b49c"
dependencies = [
"nu-ansi-term 0.50.1",
"tracing-core",
"tracing-log",
"tracing-subscriber",
]
[[package]]
name = "try-lock"
version = "0.2.5"
@ -7711,6 +8110,12 @@ version = "0.7.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9"
[[package]]
name = "utf16_iter"
version = "1.0.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c8232dd3cdaed5356e0f716d285e4b40b932ac434100fe9b7e0e8e935b9e6246"
[[package]]
name = "utf8-width"
version = "0.1.7"
@ -8296,11 +8701,20 @@ dependencies = [
"bitflags 2.9.1",
]
[[package]]
name = "write16"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d1890f4022759daae28ed4fe62859b1236caebfc61ede2f63ed4e695f3f6d936"
[[package]]
name = "writeable"
version = "0.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ea2f10b9bb0928dfb1b42b65e1f9e36f7f54dbdf08457afefb38afcdec4fa2bb"
dependencies = [
"either",
]
[[package]]
name = "wyz"

View File

@ -129,6 +129,11 @@ nanoid = "0.4.0"
jwtk = "0.4.0"
percent-encoding = "2.3.1"
mime_guess = "2.0.5"
nom = "8.0.0"
icu_properties = "2.0.1"
icu = "2.0.0"
num-traits = "0.2.19"
tracing-tree = "0.4.0"
[dev-dependencies]

View File

@ -5,6 +5,7 @@ use std::{
};
use async_graphql::{InputObject, SimpleObject};
use async_stream::try_stream;
use fetch::fetch_bytes;
use futures::{Stream, TryStreamExt, pin_mut, try_join};
use maplit::hashmap;
@ -292,17 +293,19 @@ impl SubscriptionTrait for MikanSeasonSubscription {
}
async fn sync_feeds_incremental(&self, ctx: Arc<dyn AppContextTrait>) -> RecorderResult<()> {
let rss_item_list = self
.get_rss_item_list_from_subsribed_url_rss_link(ctx.as_ref())
.await?;
let rss_item_stream = self.get_rss_item_stream_from_subsribed_url_rss_link(ctx.as_ref());
sync_mikan_feeds_from_rss_item_list(
ctx.as_ref(),
rss_item_list,
self.get_subscriber_id(),
self.get_subscription_id(),
)
.await?;
pin_mut!(rss_item_stream);
while let Some(rss_item_chunk_list) = rss_item_stream.try_next().await? {
sync_mikan_feeds_from_rss_item_list(
ctx.as_ref(),
rss_item_chunk_list,
self.get_subscriber_id(),
self.get_subscription_id(),
)
.await?;
}
Ok(())
}
@ -393,48 +396,53 @@ impl MikanSeasonSubscription {
)
}
#[tracing::instrument(err, skip(ctx))]
async fn get_rss_item_list_from_subsribed_url_rss_link(
fn get_rss_item_stream_from_subsribed_url_rss_link(
&self,
ctx: &dyn AppContextTrait,
) -> RecorderResult<Vec<MikanRssEpisodeItem>> {
let db = ctx.db();
) -> impl Stream<Item = RecorderResult<Vec<MikanRssEpisodeItem>>> {
try_stream! {
let subscribed_bangumi_list = bangumi::Entity::find()
.filter(
Condition::all()
.add(subscription_bangumi::Column::SubscriptionId.eq(self.subscription_id)),
)
.join_rev(
JoinType::InnerJoin,
subscription_bangumi::Relation::Bangumi.def(),
)
.all(db)
.await?;
let db = ctx.db();
let mut rss_item_list = vec![];
for subscribed_bangumi in subscribed_bangumi_list {
let rss_url = subscribed_bangumi
.rss_link
.with_whatever_context::<_, String, RecorderError>(|| {
format!(
"rss_link is required, subscription_id = {}, bangumi_name = {}",
self.subscription_id, subscribed_bangumi.display_name
)
})?;
let bytes = fetch_bytes(ctx.mikan(), rss_url).await?;
let subscribed_bangumi_list = bangumi::Entity::find()
.filter(
Condition::all()
.add(subscription_bangumi::Column::SubscriptionId.eq(self.subscription_id)),
)
.join_rev(
JoinType::InnerJoin,
subscription_bangumi::Relation::Bangumi.def(),
)
.all(db)
.await?;
let channel = rss::Channel::read_from(&bytes[..])?;
for (idx, item) in channel.items.into_iter().enumerate() {
let item = MikanRssEpisodeItem::try_from(item)
.with_whatever_context::<_, String, RecorderError>(|_| {
format!("failed to extract rss item at idx {idx}")
for subscribed_bangumi in subscribed_bangumi_list {
let rss_url = subscribed_bangumi
.rss_link
.with_whatever_context::<_, String, RecorderError>(|| {
format!(
"rss_link is required, subscription_id = {}, bangumi_name = {}",
self.subscription_id, subscribed_bangumi.display_name
)
})?;
rss_item_list.push(item);
let bytes = fetch_bytes(ctx.mikan(), rss_url).await?;
let channel = rss::Channel::read_from(&bytes[..])?;
let mut rss_item_list = vec![];
for (idx, item) in channel.items.into_iter().enumerate() {
let item = MikanRssEpisodeItem::try_from(item)
.with_whatever_context::<_, String, RecorderError>(|_| {
format!("failed to extract rss item at idx {idx}")
})?;
rss_item_list.push(item);
}
yield rss_item_list;
}
}
Ok(rss_item_list)
}
}

File diff suppressed because it is too large Load Diff

View File

@ -1,845 +0,0 @@
/**
* @TODO: rewrite with nom
*/
use std::borrow::Cow;
use itertools::Itertools;
use lazy_static::lazy_static;
use regex::Regex;
use serde::{Deserialize, Serialize};
use snafu::whatever;
use crate::{
errors::RecorderResult,
extract::defs::{DIGIT_1PLUS_REG, ZH_NUM_MAP, ZH_NUM_RE},
};
// Replacement template paired with `NAME_EXTRACT_REPLACE_ADHOC1_RE`: rewrites `<zh>_<en>` into
// `<zh>/<en>` so the later split on `/` separates the two names.
const NAME_EXTRACT_REPLACE_ADHOC1_REPLACED: &str = "$1/$2";
// All patterns are compiled on first access; the `unwrap()` calls panic only if a literal
// pattern below is itself invalid.
lazy_static! {
// General episode title. Capture groups as consumed by `extract_episode_meta_from_origin_name`:
// 1 = title body, 2 = episode marker (` - 08`, `[05]`, `[08v2]`, `第3话`, `EP12`, an episode
// range, ...), 3 = trailing tag section.
static ref TITLE_RE: Regex = Regex::new(
r#"(.*|\[.*])( -? \d+|\[\d+]|\[\d+.?[vV]\d]|第\d+[话話集]|\[第?\d+[话話集]]|\[\d+.?END]|[Ee][Pp]?\d+|\[\s*\d+\s*[\-\~]\s*\d+\s*\p{scx=Han}*[话話集]\s*])(.*)"#
).unwrap();
// Episode range / batch marker (`<digits> - <digits>` or `~`, optionally bracketed). When the
// episode marker matches this, the caller blanks it instead of parsing an index from it.
static ref EP_COLLECTION_RE:Regex = Regex::new(r#"\[?\s*\d+\s*[\-\~]\s*\d+\s*\p{scx=Han}*合?[话話集]\s*]?"#).unwrap();
// Movie title: same three-group shape as `TITLE_RE`, but group 2 is a movie keyword. It is
// tried before `TITLE_RE`.
static ref MOVIE_TITLE_RE:Regex = Regex::new(r#"(.*|\[.*])(剧场版|[Mm]ovie|电影)(.*?)$"#).unwrap();
// Resolution markers; unanchored, so `1080` also matches inside `1080p`.
static ref RESOLUTION_RE: Regex = Regex::new(r"1080|720|2160|4K|2K").unwrap();
// Preferred ("level 1") source markers.
static ref SOURCE_L1_RE: Regex = Regex::new(r"B-Global|[Bb]aha|[Bb]ilibili|AT-X|W[Ee][Bb][Rr][Ii][Pp]|Sentai|B[Dd][Rr][Ii][Pp]|UHD[Rr][Ii][Pp]|NETFLIX").unwrap();
// Fallback ("level 2") source markers; consulted only when no level-1 marker was found.
static ref SOURCE_L2_RE: Regex = Regex::new(r"AMZ|CR|W[Ee][Bb]|B[Dd]").unwrap();
// Subtitle markers: any one of the listed Han characters, or `CH` / `BIG5` / `GB`.
static ref SUB_RE: Regex = Regex::new(r"[简繁日字幕]|CH|BIG5|GB").unwrap();
// Any character that is NOT a word character, whitespace, Han/Hiragana/Katakana or `-`.
// `title_body_pre_process` replaces each such character with `/` to segment the title.
static ref PREFIX_RE: Regex =
Regex::new(r"[^\w\s\p{Unified_Ideograph}\p{scx=Han}\p{scx=Hira}\p{scx=Kana}-]").unwrap();
// A single `[` or `]`.
static ref EN_BRACKET_SPLIT_RE: Regex = Regex::new(r"[\[\]]").unwrap();
// NOTE(review): not referenced anywhere in the visible part of this file — confirm whether it
// is still needed.
static ref MOVIE_SEASON_EXTRACT_RE: Regex = Regex::new(r"剧场版|Movie|电影").unwrap();
// Seasonal "new show" tags (`新番`, `月番`, `番`) dropped from the title body.
static ref MAIN_TITLE_PREFIX_PROCESS_RE1: Regex = Regex::new(r"新番|月?番").unwrap();
// Region tags (one to three of 港/澳/台 followed by `地区`) dropped from the title body.
static ref MAIN_TITLE_PREFIX_PROCESS_RE2: Regex = Regex::new(r"[港澳台]{1,3}地区").unwrap();
// Greedy span from the first `[` to the last `]`.
static ref MAIN_TITLE_PRE_PROCESS_BACKETS_RE: Regex = Regex::new(r"\[.+\]").unwrap();
// Shortest prefix up to and including the first `[`.
static ref MAIN_TITLE_PRE_PROCESS_BACKETS_RE_SUB1: Regex = Regex::new(r"^.*?\[").unwrap();
// Any season marker: `S1`, `Season 1`, `第X季` / `第X期` (exactly one character for X), or an
// English ordinal (`1st`, `2nd`, `3rd`, `4th`, ...).
static ref SEASON_EXTRACT_SEASON_ALL_RE: Regex = Regex::new(r"S\d{1,2}|Season \d{1,2}|[第].[季期]|1st|2nd|3rd|\d{1,2}th").unwrap();
// English season prefix (`Season` or `S`).
static ref SEASON_EXTRACT_SEASON_EN_PREFIX_RE: Regex = Regex::new(r"Season|S").unwrap();
// English ordinal season marker.
static ref SEASON_EXTRACT_SEASON_EN_NTH_RE: Regex = Regex::new(r"1st|2nd|3rd|\d{1,2}th").unwrap();
// Chinese season marker.
// NOTE(review): `[季期(部分)]` is a character class, so it matches any ONE of `季`, `期`, `(`,
// `部`, `分`, `)` rather than the literal text `(部分)` — confirm this is intended.
static ref SEASON_EXTRACT_SEASON_ZH_PREFIX_RE: Regex = Regex::new(r"[第 ].*[季期(部分)]|部分").unwrap();
// The affix characters (`第`, `季`, `期`, space) surrounding a Chinese season number.
static ref SEASON_EXTRACT_SEASON_ZH_PREFIX_SUB_RE: Regex = Regex::new(r"[第季期 ]").unwrap();
// Region-restriction note such as `(仅限港澳台地区)`; only ASCII parentheses are matched.
static ref NAME_EXTRACT_REMOVE_RE: Regex = Regex::new(r"[(]仅限[港澳台]{1,3}地区[)]").unwrap();
// Separators between alternative names: `/`, two whitespace characters, `-` followed by two
// whitespace characters, or `][`.
static ref NAME_EXTRACT_SPLIT_RE: Regex = Regex::new(r"/|\s{2}|-\s{2}|\]\[").unwrap();
// `<five or more Han/space/paren characters>_<two or more ASCII letters>`; rewritten with
// `NAME_EXTRACT_REPLACE_ADHOC1_REPLACED`.
static ref NAME_EXTRACT_REPLACE_ADHOC1_RE: Regex = Regex::new(r"([\p{scx=Han}\s\(\)]{5,})_([a-zA-Z]{2,})").unwrap();
// Language heuristics used to classify a name segment: at least two consecutive kana, at
// least two consecutive Han characters, at least three consecutive ASCII letters.
static ref NAME_JP_TEST: Regex = Regex::new(r"[\p{scx=Hira}\p{scx=Kana}]{2,}").unwrap();
static ref NAME_ZH_TEST: Regex = Regex::new(r"[\p{scx=Han}]{2,}").unwrap();
static ref NAME_EN_TEST: Regex = Regex::new(r"[a-zA-Z]{3,}").unwrap();
// Tag delimiters (`[`, `]`, `(`, `)`, `_`); each is replaced by a space before tokenizing.
static ref TAGS_EXTRACT_SPLIT_RE: Regex = Regex::new(r"[\[\]()_]").unwrap();
// Container suffixes stripped from the subtitle tag.
// NOTE(review): `extract_tags_from_title_extra` replaces `_` with a space before tokenizing,
// so the tokens it passes on cannot contain `_MP4` / `_MKV` — confirm this is still effective.
static ref CLEAR_SUB_RE: Regex = Regex::new(r"_MP4|_MKV").unwrap();
}
/// Metadata extracted from a raw release name by `extract_episode_meta_from_origin_name`.
///
/// The `name_*` fields are taken from the title body with the season marker still present;
/// the `name_*_no_season` fields are taken from the same body after season markers were removed.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Default)]
pub struct RawEpisodeMeta {
/// First name segment classified as English (three or more consecutive ASCII letters).
pub name_en: Option<String>,
/// Same as `name_en`, taken from the season-stripped title body.
pub name_en_no_season: Option<String>,
/// First name segment classified as Japanese (two or more consecutive kana).
pub name_jp: Option<String>,
/// Same as `name_jp`, taken from the season-stripped title body.
pub name_jp_no_season: Option<String>,
/// First name segment classified as Chinese (two or more consecutive Han characters).
pub name_zh: Option<String>,
/// Same as `name_zh`, taken from the season-stripped title body.
pub name_zh_no_season: Option<String>,
/// Season number; `1` when no season marker was found or it could not be interpreted.
pub season: i32,
/// The season marker exactly as it appeared in the title, if any.
pub season_raw: Option<String>,
/// Episode number parsed from the episode marker; `1` when none could be parsed
/// (including movies and episode collections, whose marker is blanked first).
pub episode_index: i32,
/// Subtitle tag found in the trailing tag section.
pub subtitle: Option<String>,
/// Source tag (e.g. a rip type or platform) found in the trailing tag section.
pub source: Option<String>,
/// Text between the first two square-bracket characters of the normalized title.
pub fansub: Option<String>,
/// Resolution tag found in the trailing tag section.
pub resolution: Option<String>,
}
/// Returns the text between the first and second square-bracket characters of `raw_name`,
/// which is where release groups conventionally put their tag.
///
/// Yields `None` when `raw_name` contains no bracket at all.
fn extract_fansub(raw_name: &str) -> Option<&str> {
    // Limiting the split to three pieces gives `[before, tag, rest]`; piece #1 is the tag.
    EN_BRACKET_SPLIT_RE.splitn(raw_name, 3).nth(1)
}
/// Normalizes full-width lenticular brackets (`【`, `】`) to ASCII square brackets so the rest of
/// the parser only has to deal with `[` and `]`. Every other character is copied unchanged.
fn replace_ch_bracket_to_en(raw_name: &str) -> String {
    raw_name
        .chars()
        .map(|ch| match ch {
            '【' => '[',
            '】' => ']',
            other => other,
        })
        .collect()
}
fn title_body_pre_process(title_body: &str, fansub: Option<&str>) -> RecorderResult<String> {
let raw_without_fansub = if let Some(fansub) = fansub {
let fan_sub_re = Regex::new(&format!(".{fansub}."))?;
fan_sub_re.replace_all(title_body, "")
} else {
Cow::Borrowed(title_body)
};
let raw_with_prefix_replaced = PREFIX_RE.replace_all(&raw_without_fansub, "/");
let mut arg_group = raw_with_prefix_replaced
.split('/')
.map(|s| s.trim())
.filter(|s| !s.is_empty())
.collect::<Vec<_>>();
if arg_group.len() == 1 {
arg_group = arg_group.first_mut().unwrap().split(' ').collect();
}
let mut raw = raw_without_fansub.to_string();
for arg in arg_group.iter() {
if (arg_group.len() <= 5 && MAIN_TITLE_PREFIX_PROCESS_RE1.is_match(arg))
|| (MAIN_TITLE_PREFIX_PROCESS_RE2.is_match(arg))
{
let sub = Regex::new(&format!(".{arg}."))?;
raw = sub.replace_all(&raw, "").to_string();
}
}
if let Some(m) = MAIN_TITLE_PRE_PROCESS_BACKETS_RE.find(&raw)
&& m.len() as f32 > (raw.len() as f32) * 0.5
{
let mut raw1 = MAIN_TITLE_PRE_PROCESS_BACKETS_RE_SUB1
.replace(&raw, "")
.chars()
.collect_vec();
while let Some(ch) = raw1.pop() {
if ch == ']' {
break;
}
}
raw = raw1.into_iter().collect();
}
Ok(raw.to_string())
}
/// Splits a title body into its name and season information.
///
/// Returns `(name, season_raw, season)`:
/// * `name` — the title body with square brackets turned into spaces and every season marker
///   removed; when no marker is present the input is returned untouched.
/// * `season_raw` — the first season marker exactly as written, if any.
/// * `season` — the number parsed from that marker, or `1` when there is no marker or it cannot
///   be interpreted.
///
/// Only the first marker decides the season, even if the title contains several.
pub fn extract_season_from_title_body(title_body: &str) -> (String, Option<String>, i32) {
    let name_and_season = EN_BRACKET_SPLIT_RE.replace_all(title_body, " ");

    let Some(first_marker) = SEASON_EXTRACT_SEASON_ALL_RE.find(&name_and_season) else {
        return (title_body.to_string(), None, 1);
    };
    let marker = first_marker.as_str();

    let name = SEASON_EXTRACT_SEASON_ALL_RE.replace_all(&name_and_season, "");
    let season = parse_season_marker(marker).unwrap_or(1);

    (name.into_owned(), Some(marker.to_string()), season)
}

/// Interprets a single season marker (`S2`, `Season 2`, `2nd`, `第二季`, `第2期`, ...) as a number.
fn parse_season_marker(marker: &str) -> Option<i32> {
    // `S<n>` / `Season <n>`: strip the prefix from the marker and parse what remains.
    // (Parsing the prefix match itself, as was done before, could never produce a number.)
    if SEASON_EXTRACT_SEASON_EN_PREFIX_RE.is_match(marker)
        && let Ok(season) = SEASON_EXTRACT_SEASON_EN_PREFIX_RE
            .replace_all(marker, "")
            .trim()
            .parse::<i32>()
    {
        return Some(season);
    }

    // English ordinals: `1st`, `2nd`, `3rd`, `4th`, ...
    if let Some(ordinal) = SEASON_EXTRACT_SEASON_EN_NTH_RE.find(marker)
        && let Some(season) = DIGIT_1PLUS_REG
            .find(ordinal.as_str())
            .and_then(|digits| digits.as_str().parse::<i32>().ok())
    {
        return Some(season);
    }

    // Chinese markers: `第<n>季` / `第<n>期` with either an Arabic or a Chinese numeral.
    if let Some(zh_marker) = SEASON_EXTRACT_SEASON_ZH_PREFIX_RE.find(marker) {
        // Remove ALL affix characters; removing only the first one would leave e.g. `2季`,
        // which does not parse.
        if let Ok(season) = SEASON_EXTRACT_SEASON_ZH_PREFIX_SUB_RE
            .replace_all(zh_marker.as_str(), "")
            .parse::<i32>()
        {
            return Some(season);
        }
        if let Some(zh_num) = ZH_NUM_RE.find(zh_marker.as_str()) {
            // NOTE(review): indexing panics if `ZH_NUM_RE` can match text that is not a key of
            // `ZH_NUM_MAP`; both are defined outside this file — confirm they stay in sync.
            return Some(ZH_NUM_MAP[zh_num.as_str()]);
        }
    }

    None
}
/// Picks the English, Chinese and Japanese names out of the name section of a title body.
///
/// Returns `(name_en, name_zh, name_jp)`. The section is cleaned of region-restriction notes,
/// split on the usual name separators, and each piece is classified with the `NAME_*_TEST`
/// heuristics; the first piece of each language wins.
fn extract_name_from_title_body_name_section(
    title_body_name_section: &str,
) -> (Option<String>, Option<String>, Option<String>) {
    let without_region_note = NAME_EXTRACT_REMOVE_RE.replace_all(title_body_name_section, "");
    let normalized = NAME_EXTRACT_REPLACE_ADHOC1_RE
        .replace_all(&without_region_note, NAME_EXTRACT_REPLACE_ADHOC1_REPLACED);

    let mut candidates: Vec<String> = NAME_EXTRACT_SPLIT_RE
        .split(normalized.trim())
        .map(str::trim)
        .filter(|piece| !piece.is_empty())
        .map(str::to_string)
        .collect();

    // No explicit separator: the Chinese name may simply be the first or the last
    // space-separated word, with the other language making up the rest.
    if candidates.len() == 1 {
        let mut words: Vec<&str> = candidates[0].split(' ').collect();
        let last = words.len() - 1;
        let probe_positions = if last > 0 { vec![0, last] } else { vec![0] };
        let zh_position = probe_positions
            .into_iter()
            .find(|&position| NAME_ZH_TEST.is_match(words[position]));
        if let Some(position) = zh_position {
            let zh_word = words.remove(position).to_string();
            let remainder = words.join(" ");
            candidates = vec![zh_word, remainder];
        }
    }

    let mut name_en = None;
    let mut name_zh = None;
    let mut name_jp = None;
    for candidate in candidates {
        // Checked in priority order; a language whose slot is already filled is skipped so the
        // candidate can still be claimed by a later language.
        let slot = if NAME_JP_TEST.is_match(&candidate) && name_jp.is_none() {
            &mut name_jp
        } else if NAME_ZH_TEST.is_match(&candidate) && name_zh.is_none() {
            &mut name_zh
        } else if NAME_EN_TEST.is_match(&candidate) && name_en.is_none() {
            &mut name_en
        } else {
            continue;
        };
        *slot = Some(candidate);
    }

    (name_en, name_zh, name_jp)
}
/// Parses the first `DIGIT_1PLUS_REG` match inside the episode marker as an episode number.
///
/// Returns `None` when the marker has no such match or the match does not fit in an `i32`.
fn extract_episode_index_from_title_episode(title_episode: &str) -> Option<i32> {
    let digits = DIGIT_1PLUS_REG.find(title_episode)?;
    digits.as_str().parse().ok()
}
/// Removes every `CLEAR_SUB_RE` match (container suffixes) from a subtitle tag, passing `None`
/// through unchanged.
fn clear_sub(sub: Option<String>) -> Option<String> {
    let tag = sub?;
    Some(CLEAR_SUB_RE.replace_all(&tag, "").into_owned())
}
/// Classifies the tokens of the trailing tag section.
///
/// Returns `(subtitle, resolution, source)`. Tokens are obtained by turning brackets,
/// parentheses and underscores into spaces and splitting on spaces. Each token is assigned to
/// at most one category, tested in the order subtitle → resolution → level-1 source; when
/// several tokens fall into the same category the last one wins. Level-2 source markers are
/// only consulted when no level-1 source was found.
fn extract_tags_from_title_extra(
    title_extra: &str,
) -> (Option<String>, Option<String>, Option<String>) {
    let spaced = TAGS_EXTRACT_SPLIT_RE.replace_all(title_extra, " ");
    let tokens: Vec<&str> = spaced
        .split(' ')
        .map(str::trim)
        .filter(|token| !token.is_empty())
        .collect();

    let mut subtitle = None;
    let mut resolution = None;
    let mut source = None;
    for &token in &tokens {
        match token {
            t if SUB_RE.is_match(t) => subtitle = Some(t.to_string()),
            t if RESOLUTION_RE.is_match(t) => resolution = Some(t.to_string()),
            t if SOURCE_L1_RE.is_match(t) => source = Some(t.to_string()),
            _ => {}
        }
    }

    if source.is_none() {
        // Searching from the back keeps the "last match wins" rule.
        source = tokens
            .iter()
            .rev()
            .copied()
            .find(|&token| SOURCE_L2_RE.is_match(token))
            .map(|token| token.to_string());
    }

    (clear_sub(subtitle), resolution, source)
}
/// Returns `true` when `title` matches `MOVIE_TITLE_RE`, i.e. it contains one of the movie
/// keywords `剧场版`, `Movie`/`movie` or `电影`.
pub fn check_is_movie(title: &str) -> bool {
MOVIE_TITLE_RE.is_match(title)
}
pub fn extract_episode_meta_from_origin_name(s: &str) -> RecorderResult<RawEpisodeMeta> {
let raw_title = s.trim();
let raw_title_without_ch_brackets = replace_ch_bracket_to_en(raw_title);
let fansub = extract_fansub(&raw_title_without_ch_brackets);
let movie_capture = check_is_movie(&raw_title_without_ch_brackets);
if let Some(title_re_match_obj) = MOVIE_TITLE_RE
.captures(&raw_title_without_ch_brackets)
.or(TITLE_RE.captures(&raw_title_without_ch_brackets))
{
let mut title_body = title_re_match_obj
.get(1)
.map(|s| s.as_str().trim())
.unwrap_or_else(|| unreachable!("TITLE_RE has at least 3 capture groups"))
.to_string();
let mut title_episode = title_re_match_obj
.get(2)
.map(|s| s.as_str().trim())
.unwrap_or_else(|| unreachable!("TITLE_RE has at least 3 capture groups"));
let title_extra = title_re_match_obj
.get(3)
.map(|s| s.as_str().trim())
.unwrap_or_else(|| unreachable!("TITLE_RE has at least 3 capture groups"));
if movie_capture {
title_body += title_episode;
title_episode = "";
} else if EP_COLLECTION_RE.is_match(title_episode) {
title_episode = "";
}
let title_body = title_body_pre_process(&title_body, fansub)?;
let (name_without_season, season_raw, season) = extract_season_from_title_body(&title_body);
let (name_en, name_zh, name_jp) = extract_name_from_title_body_name_section(&title_body);
let (name_en_no_season, name_zh_no_season, name_jp_no_season) =
extract_name_from_title_body_name_section(&name_without_season);
let episode_index = extract_episode_index_from_title_episode(title_episode).unwrap_or(1);
let (sub, resolution, source) = extract_tags_from_title_extra(title_extra);
Ok(RawEpisodeMeta {
name_en,
name_en_no_season,
name_jp,
name_jp_no_season,
name_zh,
name_zh_no_season,
season,
season_raw,
episode_index,
subtitle: sub,
source,
fansub: fansub.map(|s| s.to_string()),
resolution,
})
} else {
whatever!("Can not parse episode meta from raw filename {}", raw_title)
}
}
#[cfg(test)]
mod tests {
use super::{RawEpisodeMeta, extract_episode_meta_from_origin_name};
// Shared driver for the parser tests: parses `raw_name`, deserializes `expected` from JSON and
// asserts that both are equal, printing a pretty-printed diff aid first when they are not.
fn test_raw_ep_parser_case(raw_name: &str, expected: &str) {
// NOTE(review): `unwrap_or_default()` turns an `expected` string that fails to deserialize
// into `None`, so a typo in the fixture is indistinguishable from "parser should fail" —
// confirm this is intended rather than using `expect(..)`.
let expected: Option<RawEpisodeMeta> = serde_json::from_str(expected).unwrap_or_default();
// A parse error is folded into `None`; the error itself is not inspected.
let found = extract_episode_meta_from_origin_name(raw_name).ok();
if expected != found {
println!(
"expected {} and found {} are not equal",
serde_json::to_string_pretty(&expected).unwrap(),
serde_json::to_string_pretty(&found).unwrap()
)
}
assert_eq!(expected, found);
}
#[test]
fn test_parse_ep_with_all_parts_wrapped() {
test_raw_ep_parser_case(
r#"[新Sub][1月新番][我心里危险的东西 第二季][05][HEVC][10Bit][1080P][简日双语][招募翻译]"#,
r#"{
"name_zh": "我心里危险的东西",
"name_zh_no_season": "我心里危险的东西",
"season": 2,
"season_raw": "第二季",
"episode_index": 5,
"subtitle": "简日双语",
"source": null,
"fansub": "新Sub",
"resolution": "1080P"
}"#,
)
}
#[test]
fn test_parse_ep_with_title_wrapped_by_one_square_bracket_and_season_prefix() {
test_raw_ep_parser_case(
r#"【喵萌奶茶屋】★01月新番★[我内心的糟糕念头 / Boku no Kokoro no Yabai Yatsu][18][1080p][简日双语][招募翻译]"#,
r#"{
"name_en": "Boku no Kokoro no Yabai Yatsu",
"name_en_no_season": "Boku no Kokoro no Yabai Yatsu",
"name_zh": "我内心的糟糕念头",
"name_zh_no_season": "我内心的糟糕念头",
"season": 1,
"season_raw": null,
"episode_index": 18,
"subtitle": "简日双语",
"source": null,
"fansub": "喵萌奶茶屋",
"resolution": "1080p"
}"#,
);
}
#[test]
fn test_parse_ep_with_ep_and_version() {
test_raw_ep_parser_case(
r#"[LoliHouse] 因为不是真正的伙伴而被逐出勇者队伍,流落到边境展开慢活人生 2nd / Shin no Nakama 2nd - 08v2 [WebRip 1080p HEVC-10bit AAC][简繁内封字幕]"#,
r#"{
"name_en": "Shin no Nakama 2nd",
"name_en_no_season": "Shin no Nakama",
"name_zh": "因为不是真正的伙伴而被逐出勇者队伍,流落到边境展开慢活人生 2nd",
"name_zh_no_season": "因为不是真正的伙伴而被逐出勇者队伍,流落到边境展开慢活人生",
"season": 2,
"season_raw": "2nd",
"episode_index": 8,
"subtitle": "简繁内封字幕",
"source": "WebRip",
"fansub": "LoliHouse",
"resolution": "1080p"
}"#,
)
}
#[test]
fn test_parse_ep_with_en_title_only() {
test_raw_ep_parser_case(
r"[动漫国字幕组&LoliHouse] THE MARGINAL SERVICE - 08 [WebRip 1080p HEVC-10bit AAC][简繁内封字幕]",
r#"{
"name_en": "THE MARGINAL SERVICE",
"name_en_no_season": "THE MARGINAL SERVICE",
"season": 1,
"episode_index": 8,
"subtitle": "简繁内封字幕",
"source": "WebRip",
"fansub": "动漫国字幕组&LoliHouse",
"resolution": "1080p"
}"#,
)
}
#[test]
fn test_parse_ep_with_two_zh_title() {
test_raw_ep_parser_case(
r#"[LoliHouse] 事与愿违的不死冒险者 / 非自愿的不死冒险者 / Nozomanu Fushi no Boukensha - 01 [WebRip 1080p HEVC-10bit AAC][简繁内封字幕]"#,
r#"{
"name_en": "Nozomanu Fushi no Boukensha",
"name_en_no_season": "Nozomanu Fushi no Boukensha",
"name_zh": "事与愿违的不死冒险者",
"name_zh_no_season": "事与愿违的不死冒险者",
"season": 1,
"season_raw": null,
"episode_index": 1,
"subtitle": "简繁内封字幕",
"source": "WebRip",
"fansub": "LoliHouse",
"resolution": "1080p"
}"#,
)
}
#[test]
fn test_parse_ep_with_en_zh_jp_titles() {
test_raw_ep_parser_case(
r#"[喵萌奶茶屋&LoliHouse] 碰之道 / ぽんのみち / Pon no Michi - 07 [WebRip 1080p HEVC-10bit AAC][简繁日内封字幕]"#,
r#"{
"name_en": "Pon no Michi",
"name_jp": "ぽんのみち",
"name_zh": "碰之道",
"name_en_no_season": "Pon no Michi",
"name_jp_no_season": "ぽんのみち",
"name_zh_no_season": "碰之道",
"season": 1,
"season_raw": null,
"episode_index": 7,
"subtitle": "简繁日内封字幕",
"source": "WebRip",
"fansub": "喵萌奶茶屋&LoliHouse",
"resolution": "1080p"
}"#,
)
}
#[test]
fn test_parse_ep_with_nth_season() {
test_raw_ep_parser_case(
r#"[ANi] Yowai Character Tomozakikun / 弱角友崎同学 2nd STAGE - 09 [1080P][Baha][WEB-DL][AAC AVC][CHT][MP4]"#,
r#"{
"name_en": "Yowai Character Tomozakikun",
"name_en_no_season": "Yowai Character Tomozakikun",
"name_zh": "弱角友崎同学 2nd STAGE",
"name_zh_no_season": "弱角友崎同学",
"season": 2,
"season_raw": "2nd",
"episode_index": 9,
"subtitle": "CHT",
"source": "Baha",
"fansub": "ANi",
"resolution": "1080P"
}"#,
)
}
#[test]
fn test_parse_ep_with_season_en_and_season_zh() {
test_raw_ep_parser_case(
r#"[豌豆字幕组&LoliHouse] 王者天下 第五季 / Kingdom S5 - 07 [WebRip 1080p HEVC-10bit AAC][简繁外挂字幕]"#,
r#"{
"name_en": "Kingdom S5",
"name_en_no_season": "Kingdom",
"name_zh": "王者天下 第五季",
"name_zh_no_season": "王者天下",
"season": 5,
"season_raw": "第五季",
"episode_index": 7,
"subtitle": "简繁外挂字幕",
"source": "WebRip",
"fansub": "豌豆字幕组&LoliHouse",
"resolution": "1080p"
}"#,
)
}
#[test]
fn test_parse_ep_with_airota_fansub_style_case1() {
test_raw_ep_parser_case(
r#"【千夏字幕组】【爱丽丝与特蕾丝的虚幻工厂_Alice to Therese no Maboroshi Koujou】[剧场版][WebRip_1080p_HEVC][简繁内封][招募新人]"#,
r#"{
"name_en": "Alice to Therese no Maboroshi Koujou",
"name_en_no_season": "Alice to Therese no Maboroshi Koujou",
"name_zh": "爱丽丝与特蕾丝的虚幻工厂",
"name_zh_no_season": "爱丽丝与特蕾丝的虚幻工厂",
"season": 1,
"episode_index": 1,
"subtitle": "简繁内封",
"source": "WebRip",
"fansub": "千夏字幕组",
"resolution": "1080p"
}"#,
)
}
#[test]
fn test_parse_ep_with_airota_fansub_style_case2() {
test_raw_ep_parser_case(
r#"[千夏字幕组&喵萌奶茶屋][电影 轻旅轻营 (摇曳露营) _Yuru Camp Movie][剧场版][UHDRip_2160p_HEVC][繁体][千夏15周年]"#,
r#"{
"name_en": "Yuru Camp Movie",
"name_en_no_season": "Yuru Camp Movie",
"name_zh": "电影 轻旅轻营 (摇曳露营)",
"name_zh_no_season": "电影 轻旅轻营 (摇曳露营)",
"season": 1,
"episode_index": 1,
"subtitle": "繁体",
"source": "UHDRip",
"fansub": "千夏字幕组&喵萌奶茶屋",
"resolution": "2160p"
}"#,
)
}
// Three-digit episode number in its own bracket (`[747]`), directly followed by a date
// bracket (`[2023.02.25]`). Expects `episode_index` 747 and "GB" as the subtitle.
#[test]
fn test_parse_ep_with_large_episode_style() {
test_raw_ep_parser_case(
r#"[梦蓝字幕组]New Doraemon 哆啦A梦新番[747][2023.02.25][AVC][1080P][GB_JP][MP4]"#,
r#"{
"name_en": "New Doraemon",
"name_en_no_season": "New Doraemon",
"name_zh": "哆啦A梦新番",
"name_zh_no_season": "哆啦A梦新番",
"season": 1,
"episode_index": 747,
"subtitle": "GB",
"fansub": "梦蓝字幕组",
"resolution": "1080P"
}"#,
)
}
// Chinese and English titles sit in two separate, consecutive bracket groups, and the
// title has no episode number. Expects both names and `episode_index` 1.
#[test]
fn test_parse_ep_with_many_square_brackets_split_title() {
test_raw_ep_parser_case(
r#"【MCE汉化组】[剧场版-摇曳露营][Yuru Camp][Movie][简日双语][1080P][x264 AAC]"#,
r#"{
"name_en": "Yuru Camp",
"name_en_no_season": "Yuru Camp",
"name_zh": "剧场版-摇曳露营",
"name_zh_no_season": "剧场版-摇曳露营",
"season": 1,
"episode_index": 1,
"subtitle": "简日双语",
"fansub": "MCE汉化组",
"resolution": "1080P"
}"#,
)
}
// Chinese and English titles share one bracket group and are separated only by a space
// (no `/` or `_`). The episode is written as `[02集]`.
#[test]
fn test_parse_ep_with_implicit_lang_title_sep() {
test_raw_ep_parser_case(
r#"[织梦字幕组][尼尔:机械纪元 NieR Automata Ver1.1a][02集][1080P][AVC][简日双语]"#,
r#"{
"name_en": "NieR Automata Ver1.1a",
"name_en_no_season": "NieR Automata Ver1.1a",
"name_zh": "尼尔:机械纪元",
"name_zh_no_season": "尼尔:机械纪元",
"season": 1,
"episode_index": 2,
"subtitle": "简日双语",
"fansub": "织梦字幕组",
"resolution": "1080P"
}"#,
)
}
// Every component is bracket-wrapped; the title bracket holds the Chinese and English
// names separated by a space. Expects "NETFLIX" as the source and "日语中字" as the subtitle.
#[test]
fn test_parse_ep_with_square_brackets_wrapped_and_space_split() {
test_raw_ep_parser_case(
r#"[天月搬运组][迷宫饭 Delicious in Dungeon][03][日语中字][MKV][1080P][NETFLIX][高画质版]"#,
r#"
{
"name_en": "Delicious in Dungeon",
"name_en_no_season": "Delicious in Dungeon",
"name_zh": "迷宫饭",
"name_zh_no_season": "迷宫饭",
"season": 1,
"episode_index": 3,
"subtitle": "日语中字",
"source": "NETFLIX",
"fansub": "天月搬运组",
"resolution": "1080P"
}
"#,
)
}
// A broadcast-season bracket (`[1月新番]`) sits between the fansub and the title brackets.
// The expected names do not include it.
#[test]
fn test_parse_ep_with_start_with_brackets_wrapped_season_info_prefix() {
test_raw_ep_parser_case(
r#"[爱恋字幕社][1月新番][迷宫饭][Dungeon Meshi][01][1080P][MP4][简日双语] "#,
r#"{
"name_en": "Dungeon Meshi",
"name_en_no_season": "Dungeon Meshi",
"name_zh": "迷宫饭",
"name_zh_no_season": "迷宫饭",
"season": 1,
"episode_index": 1,
"subtitle": "简日双语",
"fansub": "爱恋字幕社",
"resolution": "1080P"
}"#,
)
}
// A bracketed note (`[年龄限制版]`) appears inside the title body, before the ` - 09` episode
// marker. The expected `name_zh` keeps that bracketed note.
#[test]
fn test_parse_ep_with_small_no_title_extra_brackets_case() {
test_raw_ep_parser_case(
r#"[ANi] Mahou Shoujo ni Akogarete / 梦想成为魔法少女 [年龄限制版] - 09 [1080P][Baha][WEB-DL][AAC AVC][CHT][MP4]"#,
r#"{
"name_en": "Mahou Shoujo ni Akogarete",
"name_en_no_season": "Mahou Shoujo ni Akogarete",
"name_zh": "梦想成为魔法少女 [年龄限制版]",
"name_zh_no_season": "梦想成为魔法少女 [年龄限制版]",
"season": 1,
"episode_index": 9,
"subtitle": "CHT",
"source": "Baha",
"fansub": "ANi",
"resolution": "1080P"
}"#,
)
}
// A single title mixing digits, Chinese and Latin words, with no `/` separator. The whole
// title is expected as `name_zh`, and no `name_en` is expected.
#[test]
fn test_parse_ep_title_leading_space_style() {
test_raw_ep_parser_case(
r#"[ANi] 16bit 的感动 ANOTHER LAYER - 01 [1080P][Baha][WEB-DL][AAC AVC][CHT][MP4]"#,
r#"{
"name_zh": "16bit 的感动 ANOTHER LAYER",
"name_zh_no_season": "16bit 的感动 ANOTHER LAYER",
"season": 1,
"season_raw": null,
"episode_index": 1,
"subtitle": "CHT",
"source": "Baha",
"fansub": "ANi",
"resolution": "1080P"
}"#,
)
}
// A `★07月新番★` marker sits between the fansub and the bracketed title. The English name
// is decorated with `~ … ~`, and the expected `name_en` keeps those tildes.
#[test]
fn test_parse_ep_title_leading_month_and_wrapped_brackets_style() {
test_raw_ep_parser_case(
r#"【喵萌奶茶屋】★07月新番★[银砂糖师与黑妖精 ~ Sugar Apple Fairy Tale ~][13][1080p][简日双语][招募翻译]"#,
r#"{
"name_en": "~ Sugar Apple Fairy Tale ~",
"name_en_no_season": "~ Sugar Apple Fairy Tale ~",
"name_zh": "银砂糖师与黑妖精",
"name_zh_no_season": "银砂糖师与黑妖精",
"season": 1,
"episode_index": 13,
"subtitle": "简日双语",
"fansub": "喵萌奶茶屋",
"resolution": "1080p"
}"#,
)
}
// Only the fansub is bracketed; everything after `★4月新番` is space-separated and the
// episode is written as `第05话`. The expected subtitle is the trailing recruitment text
// and the expected source is null.
#[test]
fn test_parse_ep_title_leading_month_style() {
test_raw_ep_parser_case(
r#"【极影字幕社】★4月新番 天国大魔境 Tengoku Daimakyou 第05话 GB 720P MP4字幕社招人内详"#,
r#"{
"name_en": "Tengoku Daimakyou",
"name_en_no_season": "Tengoku Daimakyou",
"name_zh": "天国大魔境",
"name_zh_no_season": "天国大魔境",
"season": 1,
"episode_index": 5,
"subtitle": "字幕社招人内详",
"source": null,
"fansub": "极影字幕社",
"resolution": "720P"
}"#,
)
}
// Chinese title, then a Japanese (kana) title, with the episode written as `EP33`.
// Expects `name_jp` to be filled instead of `name_en`.
#[test]
fn test_parse_ep_tokusatsu_style() {
test_raw_ep_parser_case(
r#"[MagicStar] 假面骑士Geats / 仮面ライダーギーツ EP33 [WEBDL] [1080p] [TTFC]【生】"#,
r#"{
"name_jp": "仮面ライダーギーツ",
"name_jp_no_season": "仮面ライダーギーツ",
"name_zh": "假面骑士Geats",
"name_zh_no_season": "假面骑士Geats",
"season": 1,
"episode_index": 33,
"source": "WEBDL",
"fansub": "MagicStar",
"resolution": "1080p"
}"#,
)
}
// The part before `/` mixes Latin words and Chinese text. It is expected as `name_zh` in
// full, while the part after `/` is expected as `name_en`.
#[test]
fn test_parse_ep_with_multi_lang_zh_title() {
test_raw_ep_parser_case(
r#"[百冬练习组&LoliHouse] BanG Dream! 少女乐团派对☆PICO FEVER / Garupa Pico: Fever! - 26 [WebRip 1080p HEVC-10bit AAC][简繁内封字幕][END] [101.69 MB]"#,
r#"{
"name_en": "Garupa Pico: Fever!",
"name_en_no_season": "Garupa Pico: Fever!",
"name_zh": "BanG Dream! 少女乐团派对☆PICO FEVER",
"name_zh_no_season": "BanG Dream! 少女乐团派对☆PICO FEVER",
"season": 1,
"episode_index": 26,
"subtitle": "简繁内封字幕",
"source": "WebRip",
"fansub": "百冬练习组&LoliHouse",
"resolution": "1080p"
}"#,
)
}
// Batch/collection releases whose episode bracket is a range such as `[01-12 合集]`.
// Every case expects `episode_index` 1. The cases cover: a plain two-title release, one
// with explicit null Japanese names, one with season markers (`第二季` / `S2`), and one with
// three `/`-separated titles where the first is expected as `name_zh` and the last as `name_en`.
#[test]
fn test_ep_collections() {
test_raw_ep_parser_case(
r#"[奶²&LoliHouse] 蘑菇狗 / Kinokoinu: Mushroom Pup [01-12 精校合集][WebRip 1080p HEVC-10bit AAC][简日内封字幕]"#,
r#"{
"name_en": "Kinokoinu: Mushroom Pup",
"name_en_no_season": "Kinokoinu: Mushroom Pup",
"name_zh": "蘑菇狗",
"name_zh_no_season": "蘑菇狗",
"season": 1,
"episode_index": 1,
"subtitle": "简日内封字幕",
"source": "WebRip",
"fansub": "奶²&LoliHouse",
"resolution": "1080p",
"name": " 蘑菇狗 / Kinokoinu: Mushroom Pup [01-12 精校合集]"
}"#,
);
test_raw_ep_parser_case(
r#"[LoliHouse] 叹气的亡灵想隐退 / Nageki no Bourei wa Intai shitai [01-13 合集][WebRip 1080p HEVC-10bit AAC][简繁内封字幕][Fin]"#,
r#"{
"name_en": "Nageki no Bourei wa Intai shitai",
"name_en_no_season": "Nageki no Bourei wa Intai shitai",
"name_jp": null,
"name_jp_no_season": null,
"name_zh": "叹气的亡灵想隐退",
"name_zh_no_season": "叹气的亡灵想隐退",
"season": 1,
"season_raw": null,
"episode_index": 1,
"subtitle": "简繁内封字幕",
"source": "WebRip",
"fansub": "LoliHouse",
"resolution": "1080p"
}"#,
);
test_raw_ep_parser_case(
r#"[LoliHouse] 精灵幻想记 第二季 / Seirei Gensouki S2 [01-12 合集][WebRip 1080p HEVC-10bit AAC][简繁内封字幕][Fin]"#,
r#"{
"name_en": "Seirei Gensouki S2",
"name_en_no_season": "Seirei Gensouki",
"name_zh": "精灵幻想记 第二季",
"name_zh_no_season": "精灵幻想记",
"season": 2,
"season_raw": "第二季",
"episode_index": 1,
"subtitle": "简繁内封字幕",
"source": "WebRip",
"fansub": "LoliHouse",
"resolution": "1080p"
}"#,
);
test_raw_ep_parser_case(
r#"[喵萌奶茶屋&LoliHouse] 超自然武装当哒当 / 胆大党 / Dandadan [01-12 精校合集][WebRip 1080p HEVC-10bit AAC][简繁日内封字幕][Fin]"#,
r#" {
"name_en": "Dandadan",
"name_en_no_season": "Dandadan",
"name_zh": "超自然武装当哒当",
"name_zh_no_season": "超自然武装当哒当",
"season": 1,
"episode_index": 1,
"subtitle": "简繁日内封字幕",
"source": "WebRip",
"fansub": "喵萌奶茶屋&LoliHouse",
"resolution": "1080p"
}"#,
);
}
// TODO(FIXME): known-bad inputs. NOTE(review): the expectations below look like they pin
// the parser's current output rather than the ideal result — confirm before relying on them.
// Case 1: three `/`-separated titles (Chinese / Japanese / English); only `name_zh` is
// expected, and with the space before `剧场版` removed.
// Case 2: the Chinese season marker `第二季` is expected to leak into `name_en`.
#[test]
fn test_bad_cases() {
test_raw_ep_parser_case(
r#"[7³ACG x 桜都字幕组] 摇曳露营△ 剧场版/映画 ゆるキャン△/Eiga Yuru Camp△ [简繁字幕] BDrip 1080p x265 FLAC 2.0"#,
r#"{
"name_zh": "摇曳露营△剧场版",
"name_zh_no_season": "摇曳露营△剧场版",
"season": 1,
"season_raw": null,
"episode_index": 1,
"subtitle": "简繁字幕",
"source": "BDrip",
"fansub": "7³ACG x 桜都字幕组",
"resolution": "1080p"
}"#,
);
test_raw_ep_parser_case(
r#"【幻樱字幕组】【4月新番】【古见同学有交流障碍症 第二季 Komi-san wa, Komyushou Desu. S02】【22】【GB_MP4】【1920X1080】"#,
r#"{
"name_en": "第二季 Komi-san wa, Komyushou Desu. S02",
"name_en_no_season": "Komi-san wa, Komyushou Desu.",
"name_zh": "古见同学有交流障碍症",
"name_zh_no_season": "古见同学有交流障碍症",
"season": 2,
"season_raw": "第二季",
"episode_index": 22,
"subtitle": "GB",
"fansub": "幻樱字幕组",
"resolution": "1920X1080"
}"#,
);
}
}

View File

@ -17,7 +17,7 @@ use crate::{
MikanBangumiHash, MikanBangumiMeta, build_mikan_bangumi_subscription_rss_url,
scrape_mikan_poster_meta_from_image_url,
},
origin::extract_season_from_title_body,
origin::{OriginCompTrait, SeasonComp},
},
};
@ -123,7 +123,11 @@ impl ActiveModel {
let mikan_client = ctx.mikan();
let storage_service = ctx.storage();
let mikan_base_url = mikan_client.base_url();
let (_, season_raw, season_index) = extract_season_from_title_body(&meta.bangumi_title);
let season_comp = SeasonComp::parse_comp(&meta.bangumi_title)
.ok()
.map(|(_, s)| s);
let season_index = season_comp.as_ref().map(|s| s.num).unwrap_or(1);
let season_raw = season_comp.map(|s| s.source.into_owned());
let rss_url = build_mikan_bangumi_subscription_rss_url(
mikan_base_url.clone(),

View File

@ -10,7 +10,7 @@ use crate::{
errors::RecorderResult,
extract::{
mikan::{MikanEpisodeHash, MikanEpisodeMeta, build_mikan_episode_homepage_url},
origin::extract_episode_meta_from_origin_name,
origin::{OriginCompTrait, OriginNameRoot},
},
};
@ -124,7 +124,7 @@ impl ActiveModel {
episode: MikanEpisodeMeta,
) -> RecorderResult<Self> {
let mikan_base_url = ctx.mikan().base_url().clone();
let episode_extention_meta = extract_episode_meta_from_origin_name(&episode.episode_title)
let episode_extention_meta = OriginNameRoot::parse_comp(&episode.episode_title)
.inspect_err(|err| {
tracing::error!(
err = ?err,
@ -132,6 +132,7 @@ impl ActiveModel {
"Failed to parse episode extension meta from episode title, skip"
);
})
.map(|(_, e)| e.into_meta())
.ok();
let homepage = build_mikan_episode_homepage_url(mikan_base_url, &episode.mikan_episode_id);

View File

@ -1,9 +1,10 @@
use tracing::Level;
use tracing_subscriber::EnvFilter;
use tracing_subscriber::{EnvFilter, layer::SubscriberExt, util::SubscriberInitExt};
use tracing_tree::HierarchicalLayer;
use crate::logger::MODULE_WHITELIST;
pub fn try_init_testing_tracing(level: Level) {
fn build_testing_tracing_filter(level: Level) -> EnvFilter {
let crate_name = env!("CARGO_PKG_NAME");
let level = level.as_str().to_lowercase();
let mut filter = EnvFilter::new(format!("{crate_name}[]={level}"));
@ -14,5 +15,22 @@ pub fn try_init_testing_tracing(level: Level) {
filter = filter.add_directive(format!("{module}[]={level}").parse().unwrap());
}
let _ = tracing_subscriber::fmt().with_env_filter(filter).try_init();
filter
}
/// Tries to install the formatted `tracing` subscriber for tests, filtered by
/// [`build_testing_tracing_filter`] at the given `level`.
///
/// Any error returned by `try_init` is deliberately ignored.
pub fn try_init_testing_tracing(level: Level) {
    let filter = build_testing_tracing_filter(level);
    let _ = tracing_subscriber::fmt().with_env_filter(filter).try_init();
}
/// Tries to install a registry-based `tracing` subscriber for tests that renders output
/// through a [`HierarchicalLayer`], filtered by [`build_testing_tracing_filter`] at `level`.
///
/// The layer is created with `HierarchicalLayer::new(2)` and has targets and bracketed
/// fields enabled. Any error returned by `try_init` is deliberately ignored.
pub fn try_init_testing_tracing_only_leaf(level: Level) {
    let tree_layer = HierarchicalLayer::new(2)
        .with_targets(true)
        .with_bracketed_fields(true);
    let subscriber = tracing_subscriber::registry()
        .with(build_testing_tracing_filter(level))
        .with(tree_layer);
    let _ = subscriber.try_init();
}

View File

@ -1,2 +1,3 @@
pub mod http;
pub mod json;
pub mod nom;

View File

@ -0,0 +1,261 @@
use std::collections::HashMap;
use icu::properties::{CodePointMapData, props::Script};
use lazy_static::lazy_static;
use maplit::hashmap;
use nom::{
IResult, Parser,
branch::alt,
bytes::complete::tag,
character::complete::{anychar, digit1, none_of, satisfy},
combinator::{map, opt, recognize, value, verify},
error::ParseError,
multi::many1,
sequence::{delimited, preceded},
};
use num_traits::{PrimInt, Signed};
lazy_static! {
pub static ref ZH_DIGIT_MAP: HashMap<char, u32> = {
hashmap! {
'' => 0,
'零' => 0,
'一' => 1,
'壹' => 1,
'二' => 2,
'贰' => 2,
'三' => 3,
'叁' => 3,
'四' => 4,
'肆' => 4,
'五' => 5,
'伍' => 5,
'六' => 6,
'陆' => 6,
'七' => 7,
'柒' => 7,
'八' => 8,
'捌' => 8,
'九' => 9,
'玖' => 9,
'十' => 10,
'拾' => 10,
'廿' => 20,
'念' => 20,
'百' => 100,
'佰' => 100,
'千' => 1000,
'仟' => 1000,
'万' => 10000,
'萬' => 10000,
'亿' => 100000000,
'億' => 100000000,
}
};
}
/// Wraps `parser` so that, on success, it also yields the slice of input it consumed.
///
/// The returned closure produces `(output, consumed)`, where `consumed` is the prefix of
/// the original input up to where the remaining input starts. Errors from `parser` are
/// passed through unchanged.
pub fn with_recognized<'a, F, O, E>(
    mut parser: F,
) -> impl FnMut(&'a str) -> IResult<&'a str, (O, &'a str), E>
where
    F: Parser<&'a str, Output = O, Error = E>,
    E: ParseError<&'a str>,
{
    move |input: &'a str| {
        parser.parse(input).map(|(remaining, parsed)| {
            // The consumed text is whatever precedes the remaining input.
            let matched = &input[..input.len() - remaining.len()];
            (remaining, (parsed, matched))
        })
    }
}
/// Parses one character and succeeds only if the `Script` value that
/// `CodePointMapData::<Script>` reports for it equals `script`.
///
/// On success returns the remaining input and the matched character.
pub fn is_some_unicode_scx(input: &str, script: Script) -> IResult<&str, char> {
    let scripts = CodePointMapData::<Script>::new();
    let has_script = |c: &char| scripts.get(*c) == script;
    verify(anychar, has_script).parse(input)
}
/// Parses one character whose script is `Script::Han`; see [`is_some_unicode_scx`].
pub fn is_han_scx(input: &str) -> IResult<&str, char> {
is_some_unicode_scx(input, Script::Han)
}
/// Parses one character whose script is `Script::Hiragana`; see [`is_some_unicode_scx`].
pub fn is_hira_scx(input: &str) -> IResult<&str, char> {
is_some_unicode_scx(input, Script::Hiragana)
}
/// Parses one character whose script is `Script::Katakana`; see [`is_some_unicode_scx`].
pub fn is_kana_scx(input: &str) -> IResult<&str, char> {
is_some_unicode_scx(input, Script::Katakana)
}
/// Parses a non-empty, non-nested bracket group and returns the text between the brackets.
///
/// Two bracket styles are accepted: ASCII `[...]` and CJK lenticular `【...】`. The content
/// must contain at least one character and none of the bracket characters of its own style.
pub fn delimited_by_brackets(input: &str) -> IResult<&str, &str> {
    alt((
        delimited(tag("["), recognize(many1(none_of("[]"))), tag("]")),
        // The delimiters must be the lenticular brackets excluded by `none_of`. An empty
        // tag matches unconditionally and would accept any bracket-free run of text.
        delimited(tag("【"), recognize(many1(none_of("【】"))), tag("】")),
    ))
    .parse(input)
}
pub struct ZhNum {
pub int: i32,
}
impl ZhNum {
fn parse_digit<'a>(
max_value: u32,
) -> impl Parser<&'a str, Output = u32, Error = nom::error::Error<&'a str>> {
map(
satisfy(move |c| ZH_DIGIT_MAP.get(&c).is_some_and(|v| *v <= max_value)),
|c| *ZH_DIGIT_MAP.get(&c).unwrap(),
)
}
fn parse_个(input: &str) -> IResult<&str, u32> {
Self::parse_digit(9).parse(input)
}
fn parse_十(input: &str) -> IResult<&str, u32> {
let (input, (p, o, s)) = (
opt(Self::parse_个),
map(
satisfy(|c| ZH_DIGIT_MAP.get(&c).is_some_and(|v| *v == 10 || *v == 20)),
|c| *ZH_DIGIT_MAP.get(&c).unwrap(),
),
opt(Self::parse_个),
)
.parse(input)?;
let value = p.unwrap_or(1) * o + s.unwrap_or(0);
Ok((input, value))
}
pub fn parse_百(input: &str) -> IResult<&str, u32> {
let (input, (p, o, s)) = (
opt(Self::parse_个),
map(
satisfy(|c| ZH_DIGIT_MAP.get(&c).is_some_and(|v| *v == 100 || *v == 200)),
|c| *ZH_DIGIT_MAP.get(&c).unwrap(),
),
opt(Self::parse_十),
)
.parse(input)?;
let value = p.unwrap_or(1) * o + s.unwrap_or(0);
Ok((input, value))
}
pub fn parse_千(input: &str) -> IResult<&str, u32> {
let (input, (p, o, s)) = (
opt(Self::parse_个),
value(
1000u32,
satisfy(|c| ZH_DIGIT_MAP.get(&c).is_some_and(|v| *v == 1000)),
),
opt(Self::parse_百),
)
.parse(input)?;
let value = p.unwrap_or(1) * o + s.unwrap_or(0);
Ok((input, value))
}
pub fn parse_万(input: &str) -> IResult<&str, u32> {
let (input, (p, o, s)) = (
opt(Self::parse_千),
value(
10000u32,
satisfy(|c| ZH_DIGIT_MAP.get(&c).is_some_and(|v| *v == 10000)),
),
opt(Self::parse_千),
)
.parse(input)?;
let value = p.unwrap_or(1) * o + s.unwrap_or(0);
Ok((input, value))
}
pub fn parse_亿(input: &str) -> IResult<&str, u32> {
let (input, (p, o, s)) = (
opt(Self::parse_万),
value(
100000000u32,
satisfy(|c| ZH_DIGIT_MAP.get(&c).is_some_and(|v| *v == 100000000)),
),
opt(Self::parse_万),
)
.parse(input)?;
let value = p.unwrap_or(1) * o + s.unwrap_or(0);
Ok((input, value))
}
pub fn parse_uint(input: &str) -> IResult<&str, u32> {
preceded(
opt(tag("")),
alt((
Self::parse_个,
Self::parse_十,
Self::parse_百,
Self::parse_千,
Self::parse_万,
Self::parse_亿,
)),
)
.parse(input)
}
pub fn parse_int(input: &str) -> IResult<&str, i32> {
let (input, (sign, value)) = (
opt(alt((value(1, tag("")), value(-1, tag(""))))),
alt((
Self::parse_个,
Self::parse_十,
Self::parse_百,
Self::parse_千,
Self::parse_万,
Self::parse_亿,
)),
)
.parse(input)?;
Ok((input, sign.unwrap_or(1) * value as i32))
}
}
pub fn parse_uint<T: PrimInt>(input: &str) -> IResult<&str, T> {
let (input, value) = preceded(opt(tag("+")), digit1).parse(input)?;
let value = T::from_str_radix(value, 10).map_err(|_| {
nom::Err::Error(nom::error::Error::new(input, nom::error::ErrorKind::Digit))
})?;
Ok((input, value))
}
pub fn parse_int<T: PrimInt + Signed>(input: &str) -> IResult<&str, T> {
let (input, value) = recognize((
opt(alt((
value(T::one(), tag("+")),
value(T::one().neg(), tag("-")),
))),
digit1,
))
.parse(input)?;
let value = T::from_str_radix(value, 10).map_err(|_| {
nom::Err::Error(nom::error::Error::new(input, nom::error::ErrorKind::Digit))
})?;
Ok((input, value))
}
/// Parses a month number written either with Chinese numerals or with ASCII digits and
/// accepts it only when it lies in `1..=12`.
pub fn parse_month_num(input: &str) -> IResult<&str, u32> {
    let any_number = alt((ZhNum::parse_uint, parse_uint::<u32>));
    verify(any_number, |month: &u32| (1..=12).contains(month)).parse(input)
}