feat: add new test resource mikan classic episodes tiny.parquet
parent f055011b86
commit cde3361458

apps/recorder/.gitignore (vendored, 2 changes)
@@ -27,3 +27,5 @@ node_modules
 dist/
 temp/*
 !temp/.gitkeep
+tests/resources/mikan/classic_episodes/*/*
+!tests/resources/mikan/classic_episodes/parquet/tiny.parquet
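Note: the `*/*` form matters here. `tests/resources/mikan/classic_episodes/*/*` ignores entries two levels below `classic_episodes` without ignoring the `parquet/` directory itself; git cannot re-include a file whose parent directory is excluded, so this is what allows the `!tests/resources/mikan/classic_episodes/parquet/tiny.parquet` negation to take effect.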
@@ -154,7 +154,11 @@ icu = "2.0.0"
 tracing-tree = "0.4.0"
 num_cpus = "1.17.0"
 headers-accept = "0.1.4"
-polars = { version = "0.49.1", features = ["parquet"], optional = true }
+polars = { version = "0.49.1", features = [
+    "parquet",
+    "lazy",
+    "diagonal_concat",
+], optional = true }
 
 [dev-dependencies]
 inquire = { workspace = true }
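Note: the two added feature flags map directly to APIs used by the new merge code below: `lazy` provides `LazyFrame`, and `diagonal_concat` provides `concat_lf_diagonal`, which concatenates frames whose schemas differ by null-filling the missing columns. A minimal sketch of that behaviour (toy data, not from this commit):

    use polars::prelude::*;

    fn diagonal_union() -> PolarsResult<DataFrame> {
        // Two frames with different schemas, e.g. snapshots scraped before
        // and after a column was added to the table layout.
        let older = df!("original_name" => ["ep1"])?.lazy();
        let newer = df!("original_name" => ["ep2"], "magnet_link" => ["magnet:?xt=urn:btih:aaa"])?.lazy();

        // The result schema is the union of both column sets;
        // rows from `older` get a null magnet_link.
        concat_lf_diagonal([older, newer], UnionArgs::default())?.collect()
    }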
@@ -2,6 +2,7 @@ use std::collections::HashSet;
 
 use chrono::{DateTime, Duration, FixedOffset, NaiveDate, NaiveTime, TimeZone, Utc};
 use fetch::{HttpClientConfig, fetch_html};
+use itertools::Itertools;
 use lazy_static::lazy_static;
 use nom::{
     IResult, Parser,
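Note: the one added import, `use itertools::Itertools;`, is what brings `collect_vec()` into scope for the column selection in the merge function added below.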
@@ -398,6 +399,136 @@ async fn scrape_mikan_classic_episode_table_page_from_rev_id(
     scrape_mikan_classic_episode_table_page(mikan_client, page, Some((rev_idx, total))).await
 }
 
+async fn merge_mikan_classic_episodes_and_strip_columns() -> RecorderResult<()> {
+    use polars::prelude::*;
+
+    let dir = TEST_FOLDER.join("parquet");
+    let files = std::fs::read_dir(dir)?;
+
+    let parquet_paths = files
+        .filter_map(|f| f.ok())
+        .filter_map(|f| {
+            let path = f.path();
+            if let Some(ext) = path.extension()
+                && ext == "parquet"
+                && path
+                    .file_stem()
+                    .is_some_and(|f| f.to_string_lossy().starts_with("rev_"))
+            {
+                Some(path)
+            } else {
+                None
+            }
+        })
+        .collect::<Vec<_>>();
+
+    if parquet_paths.is_empty() {
+        return Err(RecorderError::without_source(
+            "No parquet files found to merge".into(),
+        ));
+    }
+
+    println!("Found {} parquet files to merge", parquet_paths.len());
+
+    // Read and merge all parquet files
+    let mut all_dfs = Vec::new();
+    for path in &parquet_paths {
+        println!("Reading {path:?}");
+        let file = std::fs::File::open(path)?;
+        let df = ParquetReader::new(file).finish().map_err(|e| {
+            let message = format!("Failed to read parquet file {path:?}: {e}");
+            RecorderError::with_source(Box::new(e), message)
+        })?;
+        all_dfs.push(df);
+    }
+
+    let lazy_frames: Vec<LazyFrame> = all_dfs.into_iter().map(|df| df.lazy()).collect();
+
+    let merged_df = concat_lf_diagonal(&lazy_frames, UnionArgs::default())
+        .map_err(|e| {
+            let message = format!("Failed to concat DataFrames: {e}");
+            RecorderError::with_source(Box::new(e), message)
+        })?
+        .sort(
+            ["publish_at_timestamp"],
+            SortMultipleOptions::default().with_order_descending(true),
+        )
+        .unique(
+            Some(vec![
+                "mikan_fansub_id".to_string(),
+                "mikan_episode_id".to_string(),
+            ]),
+            UniqueKeepStrategy::First,
+        )
+        .collect()
+        .map_err(|e| {
+            let message = format!("Failed to collect lazy DataFrame: {e}");
+            RecorderError::with_source(Box::new(e), message)
+        })?;
+
+    fn select_columns_and_write(
+        merged_df: DataFrame,
+        name: &str,
+        columns: &[&str],
+    ) -> RecorderResult<()> {
+        let result_df = merged_df
+            .lazy()
+            .sort(["publish_at_timestamp"], SortMultipleOptions::default())
+            .select(columns.iter().map(|c| col(*c)).collect_vec())
+            .collect()
+            .map_err(|e| {
+                let message = format!("Failed to sort and select columns: {e}");
+                RecorderError::with_source(Box::new(e), message)
+            })?;
+
+        let output_path = TEST_FOLDER.join(format!("parquet/{name}.parquet"));
+        let mut output_file = std::fs::File::create(&output_path)?;
+
+        ParquetWriter::new(&mut output_file)
+            .set_parallel(true)
+            .with_compression(ParquetCompression::Zstd(Some(
+                ZstdLevel::try_new(22).unwrap(),
+            )))
+            .finish(&mut result_df.clone())
+            .map_err(|e| {
+                let message = format!("Failed to write merged parquet file: {e}");
+                RecorderError::with_source(Box::new(e), message)
+            })?;
+
+        println!("Merged {} rows into {output_path:?}", result_df.height());
+        Ok(())
+    }
+
+    select_columns_and_write(merged_df.clone(), "tiny", &["fansub_name", "original_name"])?;
+    select_columns_and_write(
+        merged_df.clone(),
+        "lite",
+        &[
+            "mikan_fansub_id",
+            "fansub_name",
+            "mikan_episode_id",
+            "original_name",
+        ],
+    )?;
+    select_columns_and_write(
+        merged_df,
+        "full",
+        &[
+            "id",
+            "publish_at_timestamp",
+            "mikan_fansub_id",
+            "fansub_name",
+            "mikan_episode_id",
+            "original_name",
+            "magnet_link",
+            "file_size",
+            "torrent_link",
+        ],
+    )?;
+
+    Ok(())
+}
+
 #[tokio::main]
 async fn main() -> RecorderResult<()> {
     std::fs::create_dir_all(TEST_FOLDER.join("html"))?;
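Note on the dedup strategy above: sorting descending by `publish_at_timestamp` and then applying `UniqueKeepStrategy::First` over `(mikan_fansub_id, mikan_episode_id)` keeps exactly one row per episode, the most recently published one across all `rev_*` snapshots. A toy sketch of the same idiom (made-up data, not from this commit):

    use polars::prelude::*;

    fn newest_row_per_key() -> PolarsResult<DataFrame> {
        let df = df!(
            "mikan_episode_id" => ["e1", "e1", "e2"],
            "publish_at_timestamp" => [100i64, 300, 200]
        )?;

        df.lazy()
            // Newest rows first...
            .sort(
                ["publish_at_timestamp"],
                SortMultipleOptions::default().with_order_descending(true),
            )
            // ...so keeping the first duplicate keeps e1@300 and e2@200.
            .unique(
                Some(vec!["mikan_episode_id".to_string()]),
                UniqueKeepStrategy::First,
            )
            .collect()
    }

The `ZstdLevel::try_new(22)` in the writer pins zstd to its maximum compression level: slow to write, but the generator runs offline and the payoff is the smallest possible checked-in fixture.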
@@ -442,5 +573,12 @@ async fn main() -> RecorderResult<()> {
         page.save_to_files()?;
     }
 
+    // Merge all parquet files
+    println!("\nMerging all parquet files...");
+
+    merge_mikan_classic_episodes_and_strip_columns().await?;
+
+    println!("Merge completed!");
+
     Ok(())
 }
tests/resources/mikan/classic_episodes/parquet/tiny.parquet: Binary file not shown.
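For tests consuming the new fixture, reading it back is symmetric with the writer above; a minimal sketch (the helper name is hypothetical, the path mirrors the .gitignore exception):

    use polars::prelude::*;
    use std::fs::File;

    // Hypothetical test helper; tiny.parquet carries only the two columns
    // the generator selected: fansub_name and original_name.
    fn load_tiny_fixture() -> PolarsResult<DataFrame> {
        let file = File::open("tests/resources/mikan/classic_episodes/parquet/tiny.parquet")?;
        ParquetReader::new(file).finish()
    }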