feat: add new test resource mikan classic episodes tiny.parquet
This commit is contained in:
		
							parent
							
								
									f055011b86
								
							
						
					
					
						commit
						cde3361458
					
				
							
								
								
									
										2
									
								
								apps/recorder/.gitignore
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										2
									
								
								apps/recorder/.gitignore
									
									
									
									
										vendored
									
									
								
							@ -27,3 +27,5 @@ node_modules
 | 
				
			|||||||
dist/
 | 
					dist/
 | 
				
			||||||
temp/*
 | 
					temp/*
 | 
				
			||||||
!temp/.gitkeep
 | 
					!temp/.gitkeep
 | 
				
			||||||
 | 
					tests/resources/mikan/classic_episodes/*/*
 | 
				
			||||||
 | 
					!tests/resources/mikan/classic_episodes/parquet/tiny.parquet
 | 
				
			||||||
@ -154,7 +154,11 @@ icu = "2.0.0"
 | 
				
			|||||||
tracing-tree = "0.4.0"
 | 
					tracing-tree = "0.4.0"
 | 
				
			||||||
num_cpus = "1.17.0"
 | 
					num_cpus = "1.17.0"
 | 
				
			||||||
headers-accept = "0.1.4"
 | 
					headers-accept = "0.1.4"
 | 
				
			||||||
polars = { version = "0.49.1", features = ["parquet"], optional = true }
 | 
					polars = { version = "0.49.1", features = [
 | 
				
			||||||
 | 
					    "parquet",
 | 
				
			||||||
 | 
					    "lazy",
 | 
				
			||||||
 | 
					    "diagonal_concat",
 | 
				
			||||||
 | 
					], optional = true }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
[dev-dependencies]
 | 
					[dev-dependencies]
 | 
				
			||||||
inquire = { workspace = true }
 | 
					inquire = { workspace = true }
 | 
				
			||||||
 | 
				
			|||||||
@ -2,6 +2,7 @@ use std::collections::HashSet;
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
use chrono::{DateTime, Duration, FixedOffset, NaiveDate, NaiveTime, TimeZone, Utc};
 | 
					use chrono::{DateTime, Duration, FixedOffset, NaiveDate, NaiveTime, TimeZone, Utc};
 | 
				
			||||||
use fetch::{HttpClientConfig, fetch_html};
 | 
					use fetch::{HttpClientConfig, fetch_html};
 | 
				
			||||||
 | 
					use itertools::Itertools;
 | 
				
			||||||
use lazy_static::lazy_static;
 | 
					use lazy_static::lazy_static;
 | 
				
			||||||
use nom::{
 | 
					use nom::{
 | 
				
			||||||
    IResult, Parser,
 | 
					    IResult, Parser,
 | 
				
			||||||
@ -398,6 +399,136 @@ async fn scrape_mikan_classic_episode_table_page_from_rev_id(
 | 
				
			|||||||
    scrape_mikan_classic_episode_table_page(mikan_client, page, Some((rev_idx, total))).await
 | 
					    scrape_mikan_classic_episode_table_page(mikan_client, page, Some((rev_idx, total))).await
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					async fn merge_mikan_classic_episodes_and_strip_columns() -> RecorderResult<()> {
 | 
				
			||||||
 | 
					    use polars::prelude::*;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    let dir = TEST_FOLDER.join("parquet");
 | 
				
			||||||
 | 
					    let files = std::fs::read_dir(dir)?;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    let parquet_paths = files
 | 
				
			||||||
 | 
					        .filter_map(|f| f.ok())
 | 
				
			||||||
 | 
					        .filter_map(|f| {
 | 
				
			||||||
 | 
					            let path = f.path();
 | 
				
			||||||
 | 
					            if let Some(ext) = path.extension()
 | 
				
			||||||
 | 
					                && ext == "parquet"
 | 
				
			||||||
 | 
					                && path
 | 
				
			||||||
 | 
					                    .file_stem()
 | 
				
			||||||
 | 
					                    .is_some_and(|f| f.to_string_lossy().starts_with("rev_"))
 | 
				
			||||||
 | 
					            {
 | 
				
			||||||
 | 
					                Some(path)
 | 
				
			||||||
 | 
					            } else {
 | 
				
			||||||
 | 
					                None
 | 
				
			||||||
 | 
					            }
 | 
				
			||||||
 | 
					        })
 | 
				
			||||||
 | 
					        .collect::<Vec<_>>();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    if parquet_paths.is_empty() {
 | 
				
			||||||
 | 
					        return Err(RecorderError::without_source(
 | 
				
			||||||
 | 
					            "No parquet files found to merge".into(),
 | 
				
			||||||
 | 
					        ));
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    println!("Found {} parquet files to merge", parquet_paths.len());
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    // 读取并合并所有 parquet 文件
 | 
				
			||||||
 | 
					    let mut all_dfs = Vec::new();
 | 
				
			||||||
 | 
					    for path in &parquet_paths {
 | 
				
			||||||
 | 
					        println!("Reading {path:?}");
 | 
				
			||||||
 | 
					        let file = std::fs::File::open(path)?;
 | 
				
			||||||
 | 
					        let df = ParquetReader::new(file).finish().map_err(|e| {
 | 
				
			||||||
 | 
					            let message = format!("Failed to read parquet file {path:?}: {e}");
 | 
				
			||||||
 | 
					            RecorderError::with_source(Box::new(e), message)
 | 
				
			||||||
 | 
					        })?;
 | 
				
			||||||
 | 
					        all_dfs.push(df);
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    let lazy_frames: Vec<LazyFrame> = all_dfs.into_iter().map(|df| df.lazy()).collect();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    let merged_df = concat_lf_diagonal(&lazy_frames, UnionArgs::default())
 | 
				
			||||||
 | 
					        .map_err(|e| {
 | 
				
			||||||
 | 
					            let message = format!("Failed to concat DataFrames: {e}");
 | 
				
			||||||
 | 
					            RecorderError::with_source(Box::new(e), message)
 | 
				
			||||||
 | 
					        })?
 | 
				
			||||||
 | 
					        .sort(
 | 
				
			||||||
 | 
					            ["publish_at_timestamp"],
 | 
				
			||||||
 | 
					            SortMultipleOptions::default().with_order_descending(true),
 | 
				
			||||||
 | 
					        )
 | 
				
			||||||
 | 
					        .unique(
 | 
				
			||||||
 | 
					            Some(vec![
 | 
				
			||||||
 | 
					                "mikan_fansub_id".to_string(),
 | 
				
			||||||
 | 
					                "mikan_episode_id".to_string(),
 | 
				
			||||||
 | 
					            ]),
 | 
				
			||||||
 | 
					            UniqueKeepStrategy::First,
 | 
				
			||||||
 | 
					        )
 | 
				
			||||||
 | 
					        .collect()
 | 
				
			||||||
 | 
					        .map_err(|e| {
 | 
				
			||||||
 | 
					            let message = format!("Failed to collect lazy DataFrame: {e}");
 | 
				
			||||||
 | 
					            RecorderError::with_source(Box::new(e), message)
 | 
				
			||||||
 | 
					        })?;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    fn select_columns_and_write(
 | 
				
			||||||
 | 
					        merged_df: DataFrame,
 | 
				
			||||||
 | 
					        name: &str,
 | 
				
			||||||
 | 
					        columns: &[&str],
 | 
				
			||||||
 | 
					    ) -> RecorderResult<()> {
 | 
				
			||||||
 | 
					        let result_df = merged_df
 | 
				
			||||||
 | 
					            .lazy()
 | 
				
			||||||
 | 
					            .sort(["publish_at_timestamp"], SortMultipleOptions::default())
 | 
				
			||||||
 | 
					            .select(columns.iter().map(|c| col(*c)).collect_vec())
 | 
				
			||||||
 | 
					            .collect()
 | 
				
			||||||
 | 
					            .map_err(|e| {
 | 
				
			||||||
 | 
					                let message = format!("Failed to sort and select columns: {e}");
 | 
				
			||||||
 | 
					                RecorderError::with_source(Box::new(e), message)
 | 
				
			||||||
 | 
					            })?;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        let output_path = TEST_FOLDER.join(format!("parquet/{name}.parquet"));
 | 
				
			||||||
 | 
					        let mut output_file = std::fs::File::create(&output_path)?;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        ParquetWriter::new(&mut output_file)
 | 
				
			||||||
 | 
					            .set_parallel(true)
 | 
				
			||||||
 | 
					            .with_compression(ParquetCompression::Zstd(Some(
 | 
				
			||||||
 | 
					                ZstdLevel::try_new(22).unwrap(),
 | 
				
			||||||
 | 
					            )))
 | 
				
			||||||
 | 
					            .finish(&mut result_df.clone())
 | 
				
			||||||
 | 
					            .map_err(|e| {
 | 
				
			||||||
 | 
					                let message = format!("Failed to write merged parquet file: {e}");
 | 
				
			||||||
 | 
					                RecorderError::with_source(Box::new(e), message)
 | 
				
			||||||
 | 
					            })?;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        println!("Merged {} rows into {output_path:?}", result_df.height());
 | 
				
			||||||
 | 
					        Ok(())
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    select_columns_and_write(merged_df.clone(), "tiny", &["fansub_name", "original_name"])?;
 | 
				
			||||||
 | 
					    select_columns_and_write(
 | 
				
			||||||
 | 
					        merged_df.clone(),
 | 
				
			||||||
 | 
					        "lite",
 | 
				
			||||||
 | 
					        &[
 | 
				
			||||||
 | 
					            "mikan_fansub_id",
 | 
				
			||||||
 | 
					            "fansub_name",
 | 
				
			||||||
 | 
					            "mikan_episode_id",
 | 
				
			||||||
 | 
					            "original_name",
 | 
				
			||||||
 | 
					        ],
 | 
				
			||||||
 | 
					    )?;
 | 
				
			||||||
 | 
					    select_columns_and_write(
 | 
				
			||||||
 | 
					        merged_df,
 | 
				
			||||||
 | 
					        "full",
 | 
				
			||||||
 | 
					        &[
 | 
				
			||||||
 | 
					            "id",
 | 
				
			||||||
 | 
					            "publish_at_timestamp",
 | 
				
			||||||
 | 
					            "mikan_fansub_id",
 | 
				
			||||||
 | 
					            "fansub_name",
 | 
				
			||||||
 | 
					            "mikan_episode_id",
 | 
				
			||||||
 | 
					            "original_name",
 | 
				
			||||||
 | 
					            "magnet_link",
 | 
				
			||||||
 | 
					            "file_size",
 | 
				
			||||||
 | 
					            "torrent_link",
 | 
				
			||||||
 | 
					        ],
 | 
				
			||||||
 | 
					    )?;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    Ok(())
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#[tokio::main]
 | 
					#[tokio::main]
 | 
				
			||||||
async fn main() -> RecorderResult<()> {
 | 
					async fn main() -> RecorderResult<()> {
 | 
				
			||||||
    std::fs::create_dir_all(TEST_FOLDER.join("html"))?;
 | 
					    std::fs::create_dir_all(TEST_FOLDER.join("html"))?;
 | 
				
			||||||
@ -442,5 +573,12 @@ async fn main() -> RecorderResult<()> {
 | 
				
			|||||||
        page.save_to_files()?;
 | 
					        page.save_to_files()?;
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    // 合并所有 parquet 文件
 | 
				
			||||||
 | 
					    println!("\nMerging all parquet files...");
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    merge_mikan_classic_episodes_and_strip_columns().await?;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    println!("Merge completed!");
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    Ok(())
 | 
					    Ok(())
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
				
			|||||||
										
											Binary file not shown.
										
									
								
							
		Loading…
	
		Reference in New Issue
	
	Block a user