feat: classic episodes scraper
This commit is contained in:
parent
28dd9da6ac
commit
fe0b7e88e6
995
Cargo.lock
generated
995
Cargo.lock
generated
File diff suppressed because it is too large
Load Diff
@ -2,8 +2,20 @@
|
|||||||
name = "recorder"
|
name = "recorder"
|
||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
edition = "2024"
|
edition = "2024"
|
||||||
|
|
||||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||||
|
|
||||||
|
[features]
|
||||||
|
default = ["jxl"]
|
||||||
|
playground = ["dep:inquire", "dep:color-eyre", "dep:polars"]
|
||||||
|
testcontainers = [
|
||||||
|
"dep:testcontainers",
|
||||||
|
"dep:testcontainers-modules",
|
||||||
|
"dep:testcontainers-ext",
|
||||||
|
"downloader/testcontainers",
|
||||||
|
"testcontainers-modules/postgres",
|
||||||
|
]
|
||||||
|
jxl = ["dep:jpegxl-rs", "dep:jpegxl-sys"]
|
||||||
|
|
||||||
[lib]
|
[lib]
|
||||||
name = "recorder"
|
name = "recorder"
|
||||||
path = "src/lib.rs"
|
path = "src/lib.rs"
|
||||||
@ -13,17 +25,25 @@ name = "recorder_cli"
|
|||||||
path = "src/bin/main.rs"
|
path = "src/bin/main.rs"
|
||||||
required-features = []
|
required-features = []
|
||||||
|
|
||||||
[features]
|
[[example]]
|
||||||
default = ["jxl"]
|
name = "mikan_collect_classic_eps"
|
||||||
playground = ["dep:inquire", "dep:color-eyre"]
|
path = "examples/mikan_collect_classic_eps.rs"
|
||||||
testcontainers = [
|
required-features = ["playground"]
|
||||||
"dep:testcontainers",
|
|
||||||
"dep:testcontainers-modules",
|
[[example]]
|
||||||
"dep:testcontainers-ext",
|
name = "mikan_doppel_season_subscription"
|
||||||
"downloader/testcontainers",
|
path = "examples/mikan_doppel_season_subscription.rs"
|
||||||
"testcontainers-modules/postgres",
|
required-features = ["playground"]
|
||||||
]
|
|
||||||
jxl = ["dep:jpegxl-rs", "dep:jpegxl-sys"]
|
[[example]]
|
||||||
|
name = "mikan_doppel_subscriber_subscription"
|
||||||
|
path = "examples/mikan_doppel_subscriber_subscription.rs"
|
||||||
|
required-features = ["playground"]
|
||||||
|
|
||||||
|
[[example]]
|
||||||
|
name = "playground"
|
||||||
|
path = "examples/playground.rs"
|
||||||
|
required-features = ["playground"]
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
downloader = { workspace = true }
|
downloader = { workspace = true }
|
||||||
@ -93,7 +113,7 @@ fancy-regex = "0.14"
|
|||||||
lightningcss = "1.0.0-alpha.66"
|
lightningcss = "1.0.0-alpha.66"
|
||||||
html-escape = "0.2.13"
|
html-escape = "0.2.13"
|
||||||
opendal = { version = "0.53", features = ["default", "services-fs"] }
|
opendal = { version = "0.53", features = ["default", "services-fs"] }
|
||||||
scraper = "0.23"
|
scraper = "0.23.1"
|
||||||
async-graphql = { version = "7", features = ["dynamic-schema"] }
|
async-graphql = { version = "7", features = ["dynamic-schema"] }
|
||||||
async-graphql-axum = "7"
|
async-graphql-axum = "7"
|
||||||
seaography = { version = "1.1", features = [
|
seaography = { version = "1.1", features = [
|
||||||
@ -134,11 +154,11 @@ icu = "2.0.0"
|
|||||||
tracing-tree = "0.4.0"
|
tracing-tree = "0.4.0"
|
||||||
num_cpus = "1.17.0"
|
num_cpus = "1.17.0"
|
||||||
headers-accept = "0.1.4"
|
headers-accept = "0.1.4"
|
||||||
|
polars = { version = "0.49.1", features = ["parquet"], optional = true }
|
||||||
|
|
||||||
[dev-dependencies]
|
[dev-dependencies]
|
||||||
inquire = { workspace = true }
|
inquire = { workspace = true }
|
||||||
color-eyre = { workspace = true }
|
color-eyre = { workspace = true }
|
||||||
|
|
||||||
serial_test = "3"
|
serial_test = "3"
|
||||||
insta = { version = "1", features = ["redactions", "toml", "filters"] }
|
insta = { version = "1", features = ["redactions", "toml", "filters"] }
|
||||||
rstest = "0.25"
|
rstest = "0.25"
|
||||||
|
443
apps/recorder/examples/mikan_collect_classic_eps.rs
Normal file
443
apps/recorder/examples/mikan_collect_classic_eps.rs
Normal file
@ -0,0 +1,443 @@
|
|||||||
|
use std::collections::HashSet;
|
||||||
|
|
||||||
|
use chrono::{DateTime, Duration, FixedOffset, NaiveDate, NaiveTime, TimeZone, Utc};
|
||||||
|
use fetch::{HttpClientConfig, fetch_html};
|
||||||
|
use lazy_static::lazy_static;
|
||||||
|
use nom::{
|
||||||
|
IResult, Parser,
|
||||||
|
branch::alt,
|
||||||
|
bytes::complete::{tag, take, take_till1},
|
||||||
|
character::complete::space1,
|
||||||
|
combinator::map,
|
||||||
|
};
|
||||||
|
use recorder::{
|
||||||
|
errors::{RecorderError, RecorderResult},
|
||||||
|
extract::{
|
||||||
|
html::extract_inner_text_from_element_ref,
|
||||||
|
mikan::{MikanClient, MikanConfig, MikanEpisodeHash, MikanFansubHash},
|
||||||
|
},
|
||||||
|
};
|
||||||
|
use regex::Regex;
|
||||||
|
use scraper::{ElementRef, Html, Selector};
|
||||||
|
use snafu::FromString;
|
||||||
|
use url::Url;
|
||||||
|
|
||||||
|
lazy_static! {
    /// Root directory for the scraped classic-episode fixtures (`html/`, `csv/`, `parquet/`).
    ///
    /// Under `test`, debug builds, or the `playground` feature the path is anchored to the
    /// crate's manifest directory; otherwise it stays relative to the working directory.
    static ref TEST_FOLDER: std::path::PathBuf = {
        let anchor_to_manifest_dir = cfg!(any(test, debug_assertions, feature = "playground"));
        if !anchor_to_manifest_dir {
            std::path::PathBuf::from("tests/resources/mikan/classic_episodes")
        } else {
            let absolute = format!(
                "{}/tests/resources/mikan/classic_episodes",
                env!("CARGO_MANIFEST_DIR")
            );
            std::path::PathBuf::from(absolute)
        }
    };
}
|
||||||
|
|
||||||
|
lazy_static! {
    /// Captures the `total:` value passed to the `bootpag` pagination call embedded in the
    /// page's inline script; capture group 1 is the page count.
    static ref TOTAL_PAGE_REGEX: Regex = {
        let compiled =
            Regex::new(r#"\$\(\'\.classic-view-pagination2\'\)\.bootpag\(\{\s*total:\s*(\d+)"#);
        compiled.unwrap()
    };
}
|
||||||
|
|
||||||
|
pub struct MikanClassicEpisodeTableRow {
|
||||||
|
pub id: i32,
|
||||||
|
pub publish_at: DateTime<Utc>,
|
||||||
|
pub mikan_fansub_id: Option<String>,
|
||||||
|
pub fansub_name: Option<String>,
|
||||||
|
pub mikan_episode_id: String,
|
||||||
|
pub original_name: String,
|
||||||
|
pub magnet_link: Option<String>,
|
||||||
|
pub file_size: Option<String>,
|
||||||
|
pub torrent_link: Option<String>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl MikanClassicEpisodeTableRow {
|
||||||
|
fn timezone() -> FixedOffset {
|
||||||
|
FixedOffset::east_opt(8 * 3600).unwrap()
|
||||||
|
}
|
||||||
|
|
||||||
|
fn fixed_date_parser(input: &str) -> IResult<&str, NaiveDate> {
|
||||||
|
alt((
|
||||||
|
map(tag("今天"), move |_| {
|
||||||
|
Utc::now().with_timezone(&Self::timezone()).date_naive()
|
||||||
|
}),
|
||||||
|
map(tag("昨天"), move |_| {
|
||||||
|
Utc::now().with_timezone(&Self::timezone()).date_naive() - Duration::days(1)
|
||||||
|
}),
|
||||||
|
))
|
||||||
|
.parse(input)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn formatted_date_parser(input: &str) -> IResult<&str, NaiveDate> {
|
||||||
|
let (remain, date_str) = take_till1(|c: char| c.is_whitespace()).parse(input)?;
|
||||||
|
let date = NaiveDate::parse_from_str(date_str, "%Y/%m/%d").map_err(|_| {
|
||||||
|
nom::Err::Error(nom::error::Error::new(input, nom::error::ErrorKind::Verify))
|
||||||
|
})?;
|
||||||
|
Ok((remain, date))
|
||||||
|
}
|
||||||
|
|
||||||
|
fn date_parser(input: &str) -> IResult<&str, NaiveDate> {
|
||||||
|
alt((Self::fixed_date_parser, Self::formatted_date_parser)).parse(input)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn time_parser(input: &str) -> IResult<&str, NaiveTime> {
|
||||||
|
let (remain, time_str) = take(5usize).parse(input)?;
|
||||||
|
let time = NaiveTime::parse_from_str(time_str, "%H:%M").map_err(|_| {
|
||||||
|
nom::Err::Error(nom::error::Error::new(input, nom::error::ErrorKind::Verify))
|
||||||
|
})?;
|
||||||
|
Ok((remain, time))
|
||||||
|
}
|
||||||
|
|
||||||
|
fn extract_publish_at(text: &str) -> Option<DateTime<Utc>> {
|
||||||
|
let (_, (date, _, time)) = (Self::date_parser, space1, Self::time_parser)
|
||||||
|
.parse(text)
|
||||||
|
.ok()?;
|
||||||
|
let local_dt = Self::timezone()
|
||||||
|
.from_local_datetime(&date.and_time(time))
|
||||||
|
.single()?;
|
||||||
|
Some(local_dt.with_timezone(&Utc))
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn from_element_ref(
|
||||||
|
row: ElementRef<'_>,
|
||||||
|
rev_id: i32,
|
||||||
|
idx: i32,
|
||||||
|
mikan_base_url: &Url,
|
||||||
|
) -> RecorderResult<Self> {
|
||||||
|
let publish_at_selector = &Selector::parse("td:nth-of-type(1)").unwrap();
|
||||||
|
let fansub_selector = &Selector::parse("td:nth-of-type(2) > a").unwrap();
|
||||||
|
let original_name_selector =
|
||||||
|
&Selector::parse("td:nth-of-type(3) > a:nth-of-type(1)").unwrap();
|
||||||
|
let magnet_link_selector =
|
||||||
|
&Selector::parse("td:nth-of-type(3) > a:nth-of-type(2)").unwrap();
|
||||||
|
let file_size_selector = &Selector::parse("td:nth-of-type(4)").unwrap();
|
||||||
|
let torrent_link_selector = &Selector::parse("td:nth-of-type(5) > a").unwrap();
|
||||||
|
|
||||||
|
let publish_at = row
|
||||||
|
.select(publish_at_selector)
|
||||||
|
.next()
|
||||||
|
.map(extract_inner_text_from_element_ref)
|
||||||
|
.and_then(|e| Self::extract_publish_at(&e));
|
||||||
|
|
||||||
|
let (mikan_fansub_hash, fansub_name) = row
|
||||||
|
.select(fansub_selector)
|
||||||
|
.next()
|
||||||
|
.and_then(|e| {
|
||||||
|
e.attr("href")
|
||||||
|
.and_then(|s| mikan_base_url.join(s).ok())
|
||||||
|
.and_then(|u| MikanFansubHash::from_homepage_url(&u))
|
||||||
|
.map(|h| (h, extract_inner_text_from_element_ref(e)))
|
||||||
|
})
|
||||||
|
.unzip();
|
||||||
|
|
||||||
|
let (mikan_episode_hash, original_name) = row
|
||||||
|
.select(original_name_selector)
|
||||||
|
.next()
|
||||||
|
.and_then(|el| {
|
||||||
|
el.attr("href")
|
||||||
|
.and_then(|s| mikan_base_url.join(s).ok())
|
||||||
|
.and_then(|u| MikanEpisodeHash::from_homepage_url(&u))
|
||||||
|
.map(|h| (h, extract_inner_text_from_element_ref(el)))
|
||||||
|
})
|
||||||
|
.unzip();
|
||||||
|
|
||||||
|
let magnet_link = row
|
||||||
|
.select(magnet_link_selector)
|
||||||
|
.next()
|
||||||
|
.and_then(|el| el.attr("data-clipboard-text"));
|
||||||
|
|
||||||
|
let file_size = row
|
||||||
|
.select(file_size_selector)
|
||||||
|
.next()
|
||||||
|
.map(extract_inner_text_from_element_ref);
|
||||||
|
|
||||||
|
let torrent_link = row
|
||||||
|
.select(torrent_link_selector)
|
||||||
|
.next()
|
||||||
|
.and_then(|el| el.attr("href"));
|
||||||
|
|
||||||
|
if let (Some(mikan_episode_hash), Some(original_name), Some(publish_at)) = (
|
||||||
|
mikan_episode_hash.as_ref(),
|
||||||
|
original_name.as_ref(),
|
||||||
|
publish_at.as_ref(),
|
||||||
|
) {
|
||||||
|
Ok(Self {
|
||||||
|
id: rev_id * 1000 + idx,
|
||||||
|
publish_at: *publish_at,
|
||||||
|
mikan_fansub_id: mikan_fansub_hash.map(|h| h.mikan_fansub_id.clone()),
|
||||||
|
fansub_name,
|
||||||
|
mikan_episode_id: mikan_episode_hash.mikan_episode_id.clone(),
|
||||||
|
original_name: original_name.clone(),
|
||||||
|
magnet_link: magnet_link.map(|s| s.to_string()),
|
||||||
|
file_size: file_size.map(|s| s.to_string()),
|
||||||
|
torrent_link: torrent_link.map(|s| s.to_string()),
|
||||||
|
})
|
||||||
|
} else {
|
||||||
|
let mut missing_fields = vec![];
|
||||||
|
if mikan_episode_hash.is_none() {
|
||||||
|
missing_fields.push("mikan_episode_id");
|
||||||
|
}
|
||||||
|
if original_name.is_none() {
|
||||||
|
missing_fields.push("original_name");
|
||||||
|
}
|
||||||
|
if publish_at.is_none() {
|
||||||
|
missing_fields.push("publish_at");
|
||||||
|
}
|
||||||
|
Err(RecorderError::without_source(format!(
|
||||||
|
"Failed to parse episode table row, missing fields: {missing_fields:?}, row \
|
||||||
|
index: {idx}"
|
||||||
|
)))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub struct MikanClassicEpisodeTablePage {
|
||||||
|
pub page: i32,
|
||||||
|
pub total: i32,
|
||||||
|
pub html: String,
|
||||||
|
pub rows: Vec<MikanClassicEpisodeTableRow>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl MikanClassicEpisodeTablePage {
|
||||||
|
pub fn from_html(
|
||||||
|
html: String,
|
||||||
|
mikan_base_url: &Url,
|
||||||
|
page: i32,
|
||||||
|
updated_info: Option<(i32, i32)>,
|
||||||
|
) -> RecorderResult<Self> {
|
||||||
|
let tr_selector = &Selector::parse("tbody tr").unwrap();
|
||||||
|
let doc = Html::parse_document(&html);
|
||||||
|
if let Some(mut total) = TOTAL_PAGE_REGEX
|
||||||
|
.captures(&html)
|
||||||
|
.and_then(|c| c.get(1))
|
||||||
|
.and_then(|s| s.as_str().parse::<i32>().ok())
|
||||||
|
{
|
||||||
|
if let Some((_, update_total)) = updated_info {
|
||||||
|
total = update_total;
|
||||||
|
}
|
||||||
|
|
||||||
|
let rev_id = total - page;
|
||||||
|
let rows = doc
|
||||||
|
.select(tr_selector)
|
||||||
|
.rev()
|
||||||
|
.enumerate()
|
||||||
|
.map(|(idx, tr)| {
|
||||||
|
MikanClassicEpisodeTableRow::from_element_ref(
|
||||||
|
tr,
|
||||||
|
rev_id,
|
||||||
|
idx as i32,
|
||||||
|
mikan_base_url,
|
||||||
|
)
|
||||||
|
})
|
||||||
|
.collect::<RecorderResult<Vec<_>>>()?;
|
||||||
|
Ok(Self {
|
||||||
|
page,
|
||||||
|
total,
|
||||||
|
html,
|
||||||
|
rows,
|
||||||
|
})
|
||||||
|
} else {
|
||||||
|
Err(RecorderError::without_source(
|
||||||
|
"Failed to parse pagination meta and rows".into(),
|
||||||
|
))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn save_to_files(&self) -> RecorderResult<()> {
|
||||||
|
use polars::prelude::*;
|
||||||
|
|
||||||
|
let rev_id = self.total - self.page;
|
||||||
|
let parquet_path = TEST_FOLDER.join(format!("parquet/rev_{rev_id}.parquet"));
|
||||||
|
let csv_path = TEST_FOLDER.join(format!("csv/rev_{rev_id}.csv"));
|
||||||
|
let html_path = TEST_FOLDER.join(format!("html/rev_{rev_id}.html"));
|
||||||
|
|
||||||
|
std::fs::write(html_path, self.html.clone())?;
|
||||||
|
|
||||||
|
let mut publish_at_vec = Vec::new();
|
||||||
|
let mut mikan_fansub_id_vec = Vec::new();
|
||||||
|
let mut fansub_name_vec = Vec::new();
|
||||||
|
let mut mikan_episode_id_vec = Vec::new();
|
||||||
|
let mut original_name_vec = Vec::new();
|
||||||
|
let mut magnet_link_vec = Vec::new();
|
||||||
|
let mut file_size_vec = Vec::new();
|
||||||
|
let mut torrent_link_vec = Vec::new();
|
||||||
|
|
||||||
|
for row in &self.rows {
|
||||||
|
publish_at_vec.push(row.publish_at.to_rfc3339());
|
||||||
|
mikan_fansub_id_vec.push(row.mikan_fansub_id.clone());
|
||||||
|
fansub_name_vec.push(row.fansub_name.clone());
|
||||||
|
mikan_episode_id_vec.push(row.mikan_episode_id.clone());
|
||||||
|
original_name_vec.push(row.original_name.clone());
|
||||||
|
magnet_link_vec.push(row.magnet_link.clone());
|
||||||
|
file_size_vec.push(row.file_size.clone());
|
||||||
|
torrent_link_vec.push(row.torrent_link.clone());
|
||||||
|
}
|
||||||
|
|
||||||
|
let df = df! [
|
||||||
|
"publish_at_timestamp" => publish_at_vec,
|
||||||
|
"mikan_fansub_id" => mikan_fansub_id_vec,
|
||||||
|
"fansub_name" => fansub_name_vec,
|
||||||
|
"mikan_episode_id" => mikan_episode_id_vec,
|
||||||
|
"original_name" => original_name_vec,
|
||||||
|
"magnet_link" => magnet_link_vec,
|
||||||
|
"file_size" => file_size_vec,
|
||||||
|
"torrent_link" => torrent_link_vec,
|
||||||
|
]
|
||||||
|
.map_err(|e| {
|
||||||
|
let message = format!("Failed to create DataFrame: {e}");
|
||||||
|
RecorderError::with_source(Box::new(e), message)
|
||||||
|
})?;
|
||||||
|
|
||||||
|
let mut parquet_file = std::fs::File::create(&parquet_path)?;
|
||||||
|
|
||||||
|
ParquetWriter::new(&mut parquet_file)
|
||||||
|
.finish(&mut df.clone())
|
||||||
|
.map_err(|e| {
|
||||||
|
let message = format!("Failed to write parquet file: {e}");
|
||||||
|
RecorderError::with_source(Box::new(e), message)
|
||||||
|
})?;
|
||||||
|
|
||||||
|
let mut csv_file = std::fs::File::create(&csv_path)?;
|
||||||
|
|
||||||
|
CsvWriter::new(&mut csv_file)
|
||||||
|
.include_header(true)
|
||||||
|
.with_quote_style(QuoteStyle::Always)
|
||||||
|
.finish(&mut df.clone())
|
||||||
|
.map_err(|e| {
|
||||||
|
let message = format!("Failed to write csv file: {e}");
|
||||||
|
RecorderError::with_source(Box::new(e), message)
|
||||||
|
})?;
|
||||||
|
|
||||||
|
println!(
|
||||||
|
"[{}/{}] Saved {} rows to rev_{}.{{parquet,html,csv}}",
|
||||||
|
self.page,
|
||||||
|
self.total,
|
||||||
|
self.rows.len(),
|
||||||
|
rev_id
|
||||||
|
);
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn waiting_rev_ids(total: i32) -> RecorderResult<Vec<i32>> {
|
||||||
|
let dir = TEST_FOLDER.join("csv");
|
||||||
|
|
||||||
|
let files = std::fs::read_dir(dir)?;
|
||||||
|
|
||||||
|
let rev_ids = files
|
||||||
|
.filter_map(|f| f.ok())
|
||||||
|
.filter_map(|f| {
|
||||||
|
f.path().file_stem().and_then(|s| {
|
||||||
|
s.to_str().and_then(|s| {
|
||||||
|
if s.starts_with("rev_") {
|
||||||
|
s.replace("rev_", "").parse::<i32>().ok()
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
}
|
||||||
|
})
|
||||||
|
})
|
||||||
|
})
|
||||||
|
.collect::<HashSet<_>>();
|
||||||
|
|
||||||
|
Ok((0..total)
|
||||||
|
.filter(|rev_id| !rev_ids.contains(rev_id))
|
||||||
|
.collect::<Vec<_>>())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn scrape_mikan_classic_episode_table_page(
|
||||||
|
mikan_client: &MikanClient,
|
||||||
|
page: i32,
|
||||||
|
updated_info: Option<(i32, i32)>,
|
||||||
|
) -> RecorderResult<MikanClassicEpisodeTablePage> {
|
||||||
|
let mikan_base_url = mikan_client.base_url();
|
||||||
|
let url = mikan_base_url.join(&format!("/Home/Classic/{page}"))?;
|
||||||
|
|
||||||
|
if let Some((rev_id, update_total)) = updated_info.as_ref() {
|
||||||
|
let html_path = TEST_FOLDER.join(format!("html/rev_{rev_id}.html"));
|
||||||
|
if html_path.exists() {
|
||||||
|
let html = std::fs::read_to_string(&html_path)?;
|
||||||
|
println!("[{page}/{update_total}] html exists, skipping fetch");
|
||||||
|
return MikanClassicEpisodeTablePage::from_html(
|
||||||
|
html,
|
||||||
|
mikan_base_url,
|
||||||
|
page,
|
||||||
|
updated_info,
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let total = if let Some((_, update_total)) = updated_info.as_ref() {
|
||||||
|
update_total.to_string()
|
||||||
|
} else {
|
||||||
|
"Unknown".to_string()
|
||||||
|
};
|
||||||
|
|
||||||
|
println!("[{page}/{total}] fetching html...");
|
||||||
|
|
||||||
|
let html = fetch_html(mikan_client, url).await?;
|
||||||
|
|
||||||
|
println!("[{page}/{total}] fetched html done");
|
||||||
|
|
||||||
|
std::fs::write(TEST_FOLDER.join("html/temp.html"), html.clone())?;
|
||||||
|
|
||||||
|
MikanClassicEpisodeTablePage::from_html(html, mikan_base_url, page, updated_info)
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn scrape_mikan_classic_episode_table_page_from_rev_id(
|
||||||
|
mikan_client: &MikanClient,
|
||||||
|
total: i32,
|
||||||
|
rev_idx: i32,
|
||||||
|
) -> RecorderResult<MikanClassicEpisodeTablePage> {
|
||||||
|
let page = total - rev_idx;
|
||||||
|
|
||||||
|
scrape_mikan_classic_episode_table_page(mikan_client, page, Some((rev_idx, total))).await
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::main]
|
||||||
|
async fn main() -> RecorderResult<()> {
|
||||||
|
std::fs::create_dir_all(TEST_FOLDER.join("html"))?;
|
||||||
|
std::fs::create_dir_all(TEST_FOLDER.join("parquet"))?;
|
||||||
|
std::fs::create_dir_all(TEST_FOLDER.join("csv"))?;
|
||||||
|
|
||||||
|
let mikan_scrape_client = MikanClient::from_config(MikanConfig {
|
||||||
|
http_client: HttpClientConfig {
|
||||||
|
exponential_backoff_max_retries: Some(3),
|
||||||
|
leaky_bucket_max_tokens: Some(2),
|
||||||
|
leaky_bucket_initial_tokens: Some(0),
|
||||||
|
leaky_bucket_refill_tokens: Some(1),
|
||||||
|
leaky_bucket_refill_interval: Some(std::time::Duration::from_millis(1000)),
|
||||||
|
user_agent: Some(
|
||||||
|
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) \
|
||||||
|
Chrome/136.0.0.0 Safari/537.36 Edg/136.0.0.0"
|
||||||
|
.to_string(),
|
||||||
|
),
|
||||||
|
..Default::default()
|
||||||
|
},
|
||||||
|
base_url: Url::parse("https://mikanani.me")?,
|
||||||
|
})
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
let first_page_and_pagination_info =
|
||||||
|
scrape_mikan_classic_episode_table_page(&mikan_scrape_client, 1, None).await?;
|
||||||
|
|
||||||
|
let total_page = first_page_and_pagination_info.total;
|
||||||
|
|
||||||
|
first_page_and_pagination_info.save_to_files()?;
|
||||||
|
|
||||||
|
let next_rev_ids = MikanClassicEpisodeTablePage::waiting_rev_ids(total_page)?;
|
||||||
|
|
||||||
|
for todo_rev_id in next_rev_ids {
|
||||||
|
let page = scrape_mikan_classic_episode_table_page_from_rev_id(
|
||||||
|
&mikan_scrape_client,
|
||||||
|
total_page,
|
||||||
|
todo_rev_id,
|
||||||
|
)
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
page.save_to_files()?;
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
@ -12,6 +12,7 @@ pub const MIKAN_BANGUMI_POSTER_PATH: &str = "/images/Bangumi";
|
|||||||
pub const MIKAN_EPISODE_TORRENT_PATH: &str = "/Download";
|
pub const MIKAN_EPISODE_TORRENT_PATH: &str = "/Download";
|
||||||
pub const MIKAN_SUBSCRIBER_SUBSCRIPTION_RSS_PATH: &str = "/RSS/MyBangumi";
|
pub const MIKAN_SUBSCRIBER_SUBSCRIPTION_RSS_PATH: &str = "/RSS/MyBangumi";
|
||||||
pub const MIKAN_BANGUMI_RSS_PATH: &str = "/RSS/Bangumi";
|
pub const MIKAN_BANGUMI_RSS_PATH: &str = "/RSS/Bangumi";
|
||||||
|
pub const MIKAN_FANSUB_HOMEPAGE_PATH: &str = "/Home/PublishGroup";
|
||||||
pub const MIKAN_BANGUMI_ID_QUERY_KEY: &str = "bangumiId";
|
pub const MIKAN_BANGUMI_ID_QUERY_KEY: &str = "bangumiId";
|
||||||
pub const MIKAN_FANSUB_ID_QUERY_KEY: &str = "subgroupid";
|
pub const MIKAN_FANSUB_ID_QUERY_KEY: &str = "subgroupid";
|
||||||
pub const MIKAN_SUBSCRIBER_SUBSCRIPTION_TOKEN_QUERY_KEY: &str = "token";
|
pub const MIKAN_SUBSCRIBER_SUBSCRIPTION_TOKEN_QUERY_KEY: &str = "token";
|
||||||
|
@ -11,10 +11,11 @@ pub use constants::{
|
|||||||
MIKAN_ACCOUNT_MANAGE_PAGE_PATH, MIKAN_BANGUMI_EXPAND_SUBSCRIBED_PAGE_PATH,
|
MIKAN_ACCOUNT_MANAGE_PAGE_PATH, MIKAN_BANGUMI_EXPAND_SUBSCRIBED_PAGE_PATH,
|
||||||
MIKAN_BANGUMI_HOMEPAGE_PATH, MIKAN_BANGUMI_ID_QUERY_KEY, MIKAN_BANGUMI_POSTER_PATH,
|
MIKAN_BANGUMI_HOMEPAGE_PATH, MIKAN_BANGUMI_ID_QUERY_KEY, MIKAN_BANGUMI_POSTER_PATH,
|
||||||
MIKAN_BANGUMI_RSS_PATH, MIKAN_EPISODE_HOMEPAGE_PATH, MIKAN_EPISODE_TORRENT_PATH,
|
MIKAN_BANGUMI_RSS_PATH, MIKAN_EPISODE_HOMEPAGE_PATH, MIKAN_EPISODE_TORRENT_PATH,
|
||||||
MIKAN_FANSUB_ID_QUERY_KEY, MIKAN_LOGIN_PAGE_PATH, MIKAN_LOGIN_PAGE_SEARCH,
|
MIKAN_FANSUB_HOMEPAGE_PATH, MIKAN_FANSUB_ID_QUERY_KEY, MIKAN_LOGIN_PAGE_PATH,
|
||||||
MIKAN_POSTER_BUCKET_KEY, MIKAN_SEASON_FLOW_PAGE_PATH, MIKAN_SEASON_STR_QUERY_KEY,
|
MIKAN_LOGIN_PAGE_SEARCH, MIKAN_POSTER_BUCKET_KEY, MIKAN_SEASON_FLOW_PAGE_PATH,
|
||||||
MIKAN_SUBSCRIBER_SUBSCRIPTION_RSS_PATH, MIKAN_SUBSCRIBER_SUBSCRIPTION_TOKEN_QUERY_KEY,
|
MIKAN_SEASON_STR_QUERY_KEY, MIKAN_SUBSCRIBER_SUBSCRIPTION_RSS_PATH,
|
||||||
MIKAN_UNKNOWN_FANSUB_ID, MIKAN_UNKNOWN_FANSUB_NAME, MIKAN_YEAR_QUERY_KEY,
|
MIKAN_SUBSCRIBER_SUBSCRIPTION_TOKEN_QUERY_KEY, MIKAN_UNKNOWN_FANSUB_ID,
|
||||||
|
MIKAN_UNKNOWN_FANSUB_NAME, MIKAN_YEAR_QUERY_KEY,
|
||||||
};
|
};
|
||||||
pub use credential::MikanCredentialForm;
|
pub use credential::MikanCredentialForm;
|
||||||
pub use subscription::{
|
pub use subscription::{
|
||||||
@ -22,11 +23,12 @@ pub use subscription::{
|
|||||||
};
|
};
|
||||||
pub use web::{
|
pub use web::{
|
||||||
MikanBangumiHash, MikanBangumiIndexHash, MikanBangumiIndexMeta, MikanBangumiMeta,
|
MikanBangumiHash, MikanBangumiIndexHash, MikanBangumiIndexMeta, MikanBangumiMeta,
|
||||||
MikanBangumiPosterMeta, MikanEpisodeHash, MikanEpisodeMeta, MikanRssEpisodeItem,
|
MikanBangumiPosterMeta, MikanEpisodeHash, MikanEpisodeMeta, MikanFansubHash,
|
||||||
MikanSeasonFlowUrlMeta, MikanSeasonStr, MikanSubscriberSubscriptionRssUrlMeta,
|
MikanRssEpisodeItem, MikanSeasonFlowUrlMeta, MikanSeasonStr,
|
||||||
build_mikan_bangumi_expand_subscribed_url, build_mikan_bangumi_homepage_url,
|
MikanSubscriberSubscriptionRssUrlMeta, build_mikan_bangumi_expand_subscribed_url,
|
||||||
build_mikan_bangumi_subscription_rss_url, build_mikan_episode_homepage_url,
|
build_mikan_bangumi_homepage_url, build_mikan_bangumi_subscription_rss_url,
|
||||||
build_mikan_season_flow_url, build_mikan_subscriber_subscription_rss_url,
|
build_mikan_episode_homepage_url, build_mikan_season_flow_url,
|
||||||
|
build_mikan_subscriber_subscription_rss_url,
|
||||||
extract_mikan_bangumi_index_meta_list_from_season_flow_fragment,
|
extract_mikan_bangumi_index_meta_list_from_season_flow_fragment,
|
||||||
extract_mikan_bangumi_meta_from_expand_subscribed_fragment,
|
extract_mikan_bangumi_meta_from_expand_subscribed_fragment,
|
||||||
extract_mikan_episode_meta_from_episode_homepage_html,
|
extract_mikan_episode_meta_from_episode_homepage_html,
|
||||||
|
@ -22,8 +22,8 @@ use crate::{
|
|||||||
mikan::{
|
mikan::{
|
||||||
MIKAN_BANGUMI_EXPAND_SUBSCRIBED_PAGE_PATH, MIKAN_BANGUMI_HOMEPAGE_PATH,
|
MIKAN_BANGUMI_EXPAND_SUBSCRIBED_PAGE_PATH, MIKAN_BANGUMI_HOMEPAGE_PATH,
|
||||||
MIKAN_BANGUMI_ID_QUERY_KEY, MIKAN_BANGUMI_POSTER_PATH, MIKAN_BANGUMI_RSS_PATH,
|
MIKAN_BANGUMI_ID_QUERY_KEY, MIKAN_BANGUMI_POSTER_PATH, MIKAN_BANGUMI_RSS_PATH,
|
||||||
MIKAN_EPISODE_HOMEPAGE_PATH, MIKAN_FANSUB_ID_QUERY_KEY, MIKAN_POSTER_BUCKET_KEY,
|
MIKAN_EPISODE_HOMEPAGE_PATH, MIKAN_FANSUB_HOMEPAGE_PATH, MIKAN_FANSUB_ID_QUERY_KEY,
|
||||||
MIKAN_SEASON_FLOW_PAGE_PATH, MIKAN_SEASON_STR_QUERY_KEY,
|
MIKAN_POSTER_BUCKET_KEY, MIKAN_SEASON_FLOW_PAGE_PATH, MIKAN_SEASON_STR_QUERY_KEY,
|
||||||
MIKAN_SUBSCRIBER_SUBSCRIPTION_RSS_PATH, MIKAN_SUBSCRIBER_SUBSCRIPTION_TOKEN_QUERY_KEY,
|
MIKAN_SUBSCRIBER_SUBSCRIPTION_RSS_PATH, MIKAN_SUBSCRIBER_SUBSCRIPTION_TOKEN_QUERY_KEY,
|
||||||
MIKAN_YEAR_QUERY_KEY, MikanClient,
|
MIKAN_YEAR_QUERY_KEY, MikanClient,
|
||||||
},
|
},
|
||||||
@ -205,6 +205,32 @@ impl MikanBangumiMeta {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(Clone, Debug, PartialEq, Eq, Hash)]
|
||||||
|
pub struct MikanFansubHash {
|
||||||
|
pub mikan_fansub_id: String,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl MikanFansubHash {
|
||||||
|
pub fn from_homepage_url(url: &Url) -> Option<Self> {
|
||||||
|
let path = url.path();
|
||||||
|
if path.starts_with(MIKAN_FANSUB_HOMEPAGE_PATH) {
|
||||||
|
let mikan_fansub_id = path.replace(&format!("{MIKAN_FANSUB_HOMEPAGE_PATH}/"), "");
|
||||||
|
Some(Self { mikan_fansub_id })
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn build_homepage_url(self, mikan_base_url: Url) -> Url {
|
||||||
|
let mut url = mikan_base_url;
|
||||||
|
url.set_path(&format!(
|
||||||
|
"{MIKAN_FANSUB_HOMEPAGE_PATH}/{}",
|
||||||
|
self.mikan_fansub_id
|
||||||
|
));
|
||||||
|
url
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#[derive(Clone, Debug, PartialEq)]
|
#[derive(Clone, Debug, PartialEq)]
|
||||||
pub struct MikanEpisodeMeta {
|
pub struct MikanEpisodeMeta {
|
||||||
pub homepage: Url,
|
pub homepage: Url,
|
||||||
|
@ -152,7 +152,10 @@ impl ActiveModel {
|
|||||||
season_raw: ActiveValue::Set(season_raw),
|
season_raw: ActiveValue::Set(season_raw),
|
||||||
fansub: ActiveValue::Set(Some(meta.fansub)),
|
fansub: ActiveValue::Set(Some(meta.fansub)),
|
||||||
poster_link: ActiveValue::Set(poster_link),
|
poster_link: ActiveValue::Set(poster_link),
|
||||||
origin_poster_link: ActiveValue::Set(meta.origin_poster_src.map(|src| src.to_string())),
|
origin_poster_link: ActiveValue::Set(
|
||||||
|
meta.origin_poster_src
|
||||||
|
.map(|src| src[url::Position::BeforePath..].to_string()),
|
||||||
|
),
|
||||||
homepage: ActiveValue::Set(Some(meta.homepage.to_string())),
|
homepage: ActiveValue::Set(Some(meta.homepage.to_string())),
|
||||||
rss_link: ActiveValue::Set(Some(rss_url.to_string())),
|
rss_link: ActiveValue::Set(Some(rss_url.to_string())),
|
||||||
..Default::default()
|
..Default::default()
|
||||||
|
Loading…
Reference in New Issue
Block a user