diff --git a/Cargo.lock b/Cargo.lock index 5b78ded..812e607 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -275,6 +275,27 @@ dependencies = [ "syn 2.0.104", ] +[[package]] +name = "argminmax" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70f13d10a41ac8d2ec79ee34178d61e6f47a29c2edfe7ef1721c7383b0359e65" +dependencies = [ + "num-traits", +] + +[[package]] +name = "array-init-cursor" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed51fe0f224d1d4ea768be38c51f9f831dee9d05c163c11fba0b8c44387b1fc3" + +[[package]] +name = "arrayref" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76a2e8124351fda1ef8aaaa3bbd7ebbcb486bbcd4225aca0aa0d84bb2db8fecb" + [[package]] name = "arrayvec" version = "0.7.6" @@ -333,13 +354,25 @@ dependencies = [ "syn 2.0.104", ] +[[package]] +name = "async-channel" +version = "2.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "89b47800b0be77592da0afd425cc03468052844aff33b84e33cc696f64e77b6a" +dependencies = [ + "concurrent-queue", + "event-listener-strategy", + "futures-core", + "pin-project-lite", +] + [[package]] name = "async-compression" version = "0.4.25" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "40f6024f3f856663b45fd0c9b6f2024034a702f453549449e0d84a305900dad4" dependencies = [ - "brotli", + "brotli 8.0.1", "flate2", "futures-core", "memchr", @@ -499,6 +532,15 @@ dependencies = [ "num-traits", ] +[[package]] +name = "atoi_simd" +version = "0.16.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c2a49e05797ca52e312a0c658938b7d00693ef037799ef7187678f212d7684cf" +dependencies = [ + "debug_unsafe", +] + [[package]] name = "atom_syndication" version = "0.12.7" @@ -795,6 +837,19 @@ dependencies = [ "wyz", ] +[[package]] +name = "blake3" +version = "1.8.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "3888aaa89e4b2a40fca9848e400f6a658a5a3978de7be858e209cafa8be9a4a0" +dependencies = [ + "arrayref", + "arrayvec", + "cc", + "cfg-if", + "constant_time_eq", +] + [[package]] name = "block-buffer" version = "0.9.0" @@ -886,6 +941,23 @@ dependencies = [ "syn 2.0.104", ] +[[package]] +name = "boxcar" +version = "0.2.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "26c4925bc979b677330a8c7fe7a8c94af2dbb4a2d37b4a20a80d884400f46baa" + +[[package]] +name = "brotli" +version = "7.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cc97b8f16f944bba54f0433f07e30be199b6dc2bd25937444bbad560bcea29bd" +dependencies = [ + "alloc-no-stdlib", + "alloc-stdlib", + "brotli-decompressor 4.0.3", +] + [[package]] name = "brotli" version = "8.0.1" @@ -894,7 +966,17 @@ checksum = "9991eea70ea4f293524138648e41ee89b0b2b12ddef3b255effa43c8056e0e0d" dependencies = [ "alloc-no-stdlib", "alloc-stdlib", - "brotli-decompressor", + "brotli-decompressor 5.0.0", +] + +[[package]] +name = "brotli-decompressor" +version = "4.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a334ef7c9e23abf0ce748e8cd309037da93e606ad52eb372e4ce327a0dcfbdfd" +dependencies = [ + "alloc-no-stdlib", + "alloc-stdlib", ] [[package]] @@ -957,6 +1039,20 @@ name = "bytemuck" version = "1.23.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5c76a5792e44e4abe34d3abf15636779261d45a7450612059293d1d2cfc63422" +dependencies = [ + "bytemuck_derive", +] + +[[package]] +name = "bytemuck_derive" +version = "1.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7ecc273b49b3205b83d648f0690daa588925572cc5063745bfe547fe7ec8e1a1" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.104", +] [[package]] name = "byteorder" @@ -1016,6 +1112,15 @@ dependencies = [ "displaydoc", ] +[[package]] +name = "castaway" +version = 
"0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0abae9be0aaf9ea96a3b1b8b1b55c602ca751eba1b1500220cea4ecbafe7c0d5" +dependencies = [ + "rustversion", +] + [[package]] name = "cc" version = "1.2.27" @@ -1105,7 +1210,18 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "93698b29de5e97ad0ae26447b344c482a7284c737d9ddc5f9e52b74a336671bb" dependencies = [ "chrono", - "chrono-tz-build", + "chrono-tz-build 0.3.0", + "phf", +] + +[[package]] +name = "chrono-tz" +version = "0.10.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "efdce149c370f133a071ca8ef6ea340b7b88748ab0810097a9e2976eaa34b4f3" +dependencies = [ + "chrono", + "chrono-tz-build 0.4.1", "phf", ] @@ -1120,6 +1236,16 @@ dependencies = [ "phf_codegen", ] +[[package]] +name = "chrono-tz-build" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f10f8c9340e31fc120ff885fcdb54a0b48e474bbd77cab557f0c30a3e569402" +dependencies = [ + "parse-zoneinfo", + "phf_codegen", +] + [[package]] name = "cipher" version = "0.4.4" @@ -1235,6 +1361,17 @@ dependencies = [ "windows-sys 0.59.0", ] +[[package]] +name = "comfy-table" +version = "7.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4a65ebfec4fb190b6f90e944a817d60499ee0744e582530e2c9900a22e591d9a" +dependencies = [ + "crossterm 0.28.1", + "unicode-segmentation", + "unicode-width 0.2.1", +] + [[package]] name = "commoncrypto" version = "0.2.0" @@ -1253,6 +1390,21 @@ dependencies = [ "libc", ] +[[package]] +name = "compact_str" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b79c4069c6cad78e2e0cdfcbd26275770669fb39fd308a752dc110e83b9af32" +dependencies = [ + "castaway", + "cfg-if", + "itoa", + "rustversion", + "ryu", + "serde", + "static_assertions", +] + [[package]] name = "concurrent-queue" version = "2.5.0" @@ -1306,6 +1458,12 @@ version = "0.2.12" 
source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2459fc9262a1aa204eb4b5764ad4f189caec88aea9634389c0a25f8be7f6265e" +[[package]] +name = "constant_time_eq" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7c74b8349d32d297c9134b8c88677813a227df8f779daa29bfc29c183fe3dca6" + [[package]] name = "convert_case" version = "0.6.0" @@ -1480,6 +1638,19 @@ dependencies = [ "winapi", ] +[[package]] +name = "crossterm" +version = "0.28.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "829d955a0bb380ef178a640b91779e3987da38c9aea133b20614cfed8cdea9c6" +dependencies = [ + "bitflags 2.9.1", + "crossterm_winapi", + "parking_lot 0.12.4", + "rustix 0.38.44", + "winapi", +] + [[package]] name = "crossterm_winapi" version = "0.9.1" @@ -1714,6 +1885,12 @@ dependencies = [ "matches", ] +[[package]] +name = "debug_unsafe" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85d3cef41d236720ed453e102153a53e4cc3d2fde848c0078a50cf249e8e3e5b" + [[package]] name = "der" version = "0.7.10" @@ -2085,6 +2262,12 @@ dependencies = [ "windows-sys 0.59.0", ] +[[package]] +name = "ethnum" +version = "1.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ca81e6b4777c89fd810c25a4be2b1bd93ea034fbe58e6a75216a34c6b82c539b" + [[package]] name = "event-listener" version = "5.4.0" @@ -2131,6 +2314,12 @@ dependencies = [ "once_cell", ] +[[package]] +name = "fallible-streaming-iterator" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7360491ce676a36bf9bb3c56c1aa791658183a54d2744120f27285738d90465a" + [[package]] name = "fancy-regex" version = "0.14.0" @@ -2142,6 +2331,12 @@ dependencies = [ "regex-syntax 0.8.5", ] +[[package]] +name = "fast-float2" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"f8eb564c5c7423d25c886fb561d1e4ee69f72354d16918afa32c08811f6b6a55" + [[package]] name = "fast_chemail" version = "0.9.6" @@ -2258,6 +2453,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4a3d7db9596fecd151c5f638c0ee5d5bd487b6e0ea232e5dc96d5250f6f94b1d" dependencies = [ "crc32fast", + "libz-rs-sys", "miniz_oxide", ] @@ -2308,6 +2504,16 @@ dependencies = [ "percent-encoding", ] +[[package]] +name = "fs4" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8640e34b88f7652208ce9e88b1a37a2ae95227d84abec377ccd3c5cfeb141ed4" +dependencies = [ + "rustix 1.0.7", + "windows-sys 0.59.0", +] + [[package]] name = "fsevent-sys" version = "4.1.0" @@ -2565,9 +2771,9 @@ dependencies = [ [[package]] name = "gif" -version = "0.13.1" +version = "0.13.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3fb2d69b19215e18bb912fa30f7ce15846e301408695e44e0ef719f1da9e19f2" +checksum = "fcc37f9a2bfe731e69f1e08d29d91d30604b9ce24bcb2880a961e82d89c6ed89" dependencies = [ "color_quant", "weezl", @@ -2712,6 +2918,12 @@ name = "hashbrown" version = "0.14.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" +dependencies = [ + "ahash 0.8.12", + "allocator-api2", + "rayon", + "serde", +] [[package]] name = "hashbrown" @@ -2722,6 +2934,8 @@ dependencies = [ "allocator-api2", "equivalent", "foldhash", + "rayon", + "serde", ] [[package]] @@ -2973,6 +3187,12 @@ dependencies = [ "libm", ] +[[package]] +name = "humantime" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b112acc8b3adf4b107a8ec20977da0273a8c386765a3ec0229bd500a1443f9f" + [[package]] name = "hyper" version = "1.6.0" @@ -3019,6 +3239,7 @@ dependencies = [ "hyper", "hyper-util", "rustls", + "rustls-native-certs", "rustls-pki-types", "tokio", "tokio-rustls", @@ -3678,7 +3899,7 @@ source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "0fddf93031af70e75410a2511ec04d49e758ed2f26dad3404a934e0fb45cc12a" dependencies = [ "bitflags 2.9.1", - "crossterm", + "crossterm 0.25.0", "dyn-clone", "fuzzy-matcher", "fxhash", @@ -3818,9 +4039,9 @@ dependencies = [ [[package]] name = "jpeg-decoder" -version = "0.3.1" +version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f5d4a7da358eff58addd2877a45865158f0d78c911d43a5784ceb7bbf52833b0" +checksum = "00810f1d8b74be64b13dbf3db89ac67740615d6c891f0e7b6179326533011a07" [[package]] name = "jpegxl-rs" @@ -4181,6 +4402,15 @@ dependencies = [ "glob", ] +[[package]] +name = "libz-rs-sys" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "172a788537a2221661b480fee8dc5f96c580eb34fa88764d3205dc356c7e4221" +dependencies = [ + "zlib-rs", +] + [[package]] name = "lightningcss" version = "1.0.0-alpha.66" @@ -4220,6 +4450,12 @@ dependencies = [ "syn 1.0.109", ] +[[package]] +name = "linux-raw-sys" +version = "0.4.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d26c52dbd32dccf2d10cac7725f8eae5296885fb5703b261f7d0a0739ec807ab" + [[package]] name = "linux-raw-sys" version = "0.9.4" @@ -4304,6 +4540,25 @@ version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "112b39cec0b298b6c1999fee3e31427f74f676e4cb9879ed1a121b43661a4154" +[[package]] +name = "lz4" +version = "1.28.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a20b523e860d03443e98350ceaac5e71c6ba89aea7d960769ec3ce37f4de5af4" +dependencies = [ + "lz4-sys", +] + +[[package]] +name = "lz4-sys" +version = "1.11.1+lz4-1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6bd8c0d6c6ed0cd30b3652886bb8711dc4bb01d637a68105a3d5158039b418e6" +dependencies = [ + "cc", + "libc", +] + [[package]] name = "mac" version = "0.1.1" @@ -4694,6 +4949,15 @@ 
dependencies = [ "instant", ] +[[package]] +name = "now" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d89e9874397a1f0a52fc1f197a8effd9735223cb2390e9dcc83ac6cd02923d0" +dependencies = [ + "chrono", +] + [[package]] name = "nu-ansi-term" version = "0.46.0" @@ -4871,6 +5135,41 @@ dependencies = [ "memchr", ] +[[package]] +name = "object_store" +version = "0.12.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7781f96d79ed0f961a7021424ab01840efbda64ae7a505aaea195efc91eaaec4" +dependencies = [ + "async-trait", + "base64 0.22.1", + "bytes", + "chrono", + "form_urlencoded", + "futures", + "http", + "http-body-util", + "humantime", + "hyper", + "itertools 0.14.0", + "parking_lot 0.12.4", + "percent-encoding", + "quick-xml", + "rand 0.9.1", + "reqwest", + "ring", + "serde", + "serde_json", + "serde_urlencoded", + "thiserror 2.0.12", + "tokio", + "tracing", + "url", + "walkdir", + "wasm-bindgen-futures", + "web-time", +] + [[package]] name = "once_cell" version = "1.21.3" @@ -5424,6 +5723,16 @@ version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c" +[[package]] +name = "planus" +version = "1.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3daf8e3d4b712abe1d690838f6e29fb76b76ea19589c4afa39ec30e12f62af71" +dependencies = [ + "array-init-cursor", + "hashbrown 0.15.4", +] + [[package]] name = "png" version = "0.17.16" @@ -5437,6 +5746,496 @@ dependencies = [ "miniz_oxide", ] +[[package]] +name = "polars" +version = "0.49.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "443824f43bca39b178353d6c09e4b44e115b21f107a5654d5f980d20b432a303" +dependencies = [ + "getrandom 0.2.16", + "polars-arrow", + "polars-core", + "polars-error", + "polars-io", + "polars-lazy", + "polars-ops", + "polars-parquet", + "polars-sql", + 
"polars-time", + "polars-utils", + "version_check", +] + +[[package]] +name = "polars-arrow" +version = "0.49.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "809c5340e9e6c16eee5a07585161bae99f903f53af7402075efec23ee75fce5b" +dependencies = [ + "atoi_simd", + "bitflags 2.9.1", + "bytemuck", + "chrono", + "chrono-tz 0.10.3", + "dyn-clone", + "either", + "ethnum", + "getrandom 0.2.16", + "hashbrown 0.15.4", + "itoa", + "lz4", + "num-traits", + "polars-arrow-format", + "polars-error", + "polars-schema", + "polars-utils", + "serde", + "simdutf8", + "streaming-iterator", + "strum_macros", + "version_check", + "zstd", +] + +[[package]] +name = "polars-arrow-format" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "863c04c514be005eced7db7053e20d49f7e7a58048a282fa52dfea1fd5434e78" +dependencies = [ + "planus", + "serde", +] + +[[package]] +name = "polars-compute" +version = "0.49.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b8802ff2cccea01a845ea8267a7600e495747ed109035bb5020c33eb8717ff4" +dependencies = [ + "atoi_simd", + "bytemuck", + "chrono", + "either", + "fast-float2", + "hashbrown 0.15.4", + "itoa", + "num-traits", + "polars-arrow", + "polars-error", + "polars-utils", + "rand 0.8.5", + "ryu", + "serde", + "skiplist", + "strength_reduce", + "strum_macros", + "version_check", +] + +[[package]] +name = "polars-core" +version = "0.49.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3fc3c99d7000be1be11665e1e260b93dc3b927342b9da3b53d9a1ac264e4343d" +dependencies = [ + "bitflags 2.9.1", + "boxcar", + "bytemuck", + "chrono", + "chrono-tz 0.10.3", + "comfy-table", + "either", + "hashbrown 0.14.5", + "hashbrown 0.15.4", + "indexmap 2.9.0", + "itoa", + "num-traits", + "polars-arrow", + "polars-compute", + "polars-error", + "polars-row", + "polars-schema", + "polars-utils", + "rand 0.8.5", + "rand_distr", + "rayon", + "regex", + 
"serde", + "serde_json", + "strum_macros", + "uuid", + "version_check", + "xxhash-rust", +] + +[[package]] +name = "polars-error" +version = "0.49.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1397c17712e61a55fdd45c033a69f0451fde2973ff2609c22e363e21d68f11ef" +dependencies = [ + "object_store", + "parking_lot 0.12.4", + "polars-arrow-format", + "regex", + "signal-hook", + "simdutf8", +] + +[[package]] +name = "polars-expr" +version = "0.49.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "33d3aa6722c9a3e0b721ec2bcdc4affd9e50e4cb606cd81bb94535a9a5a6ade9" +dependencies = [ + "bitflags 2.9.1", + "hashbrown 0.15.4", + "num-traits", + "polars-arrow", + "polars-compute", + "polars-core", + "polars-io", + "polars-ops", + "polars-plan", + "polars-row", + "polars-time", + "polars-utils", + "rand 0.8.5", + "rayon", + "recursive", +] + +[[package]] +name = "polars-io" +version = "0.49.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a632d442a99821250a8fa66f7d488bf5ee98e5f515e65256b12956cb81fc110" +dependencies = [ + "async-trait", + "atoi_simd", + "blake3", + "bytes", + "chrono", + "fast-float2", + "fs4", + "futures", + "glob", + "hashbrown 0.15.4", + "home", + "itoa", + "memchr", + "memmap2 0.9.5", + "num-traits", + "object_store", + "percent-encoding", + "polars-arrow", + "polars-core", + "polars-error", + "polars-parquet", + "polars-schema", + "polars-time", + "polars-utils", + "rayon", + "regex", + "reqwest", + "ryu", + "serde", + "serde_json", + "simdutf8", + "tokio", + "tokio-util", + "url", +] + +[[package]] +name = "polars-lazy" +version = "0.49.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f4ed0c87bdc8820447a38ae8efdb5a51a5a93e8bd528cffb05d05cf1145e4161" +dependencies = [ + "bitflags 2.9.1", + "chrono", + "either", + "memchr", + "polars-arrow", + "polars-compute", + "polars-core", + "polars-expr", + "polars-io", + 
"polars-mem-engine", + "polars-ops", + "polars-plan", + "polars-stream", + "polars-time", + "polars-utils", + "rayon", + "version_check", +] + +[[package]] +name = "polars-mem-engine" +version = "0.49.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "675294ddf9174029e48caa4e59b0665ea64bfb784a366b197690895a6ed65c68" +dependencies = [ + "futures", + "memmap2 0.9.5", + "polars-arrow", + "polars-core", + "polars-error", + "polars-expr", + "polars-io", + "polars-ops", + "polars-plan", + "polars-time", + "polars-utils", + "rayon", + "recursive", + "tokio", +] + +[[package]] +name = "polars-ops" +version = "0.49.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1eb4db68956f857c52eeda072d87644a7b42eac41d55073af94dfac8441af6cf" +dependencies = [ + "argminmax", + "base64 0.22.1", + "bytemuck", + "chrono", + "chrono-tz 0.10.3", + "either", + "hashbrown 0.15.4", + "hex 0.4.3", + "indexmap 2.9.0", + "libm", + "memchr", + "num-traits", + "polars-arrow", + "polars-compute", + "polars-core", + "polars-error", + "polars-schema", + "polars-utils", + "rayon", + "regex", + "regex-syntax 0.8.5", + "strum_macros", + "unicode-normalization", + "unicode-reverse", + "version_check", +] + +[[package]] +name = "polars-parquet" +version = "0.49.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7c849c10edd9511ccd4ec4130e283ee3a8b3bb48a7d74ac6354c1c20add81065" +dependencies = [ + "async-stream", + "base64 0.22.1", + "brotli 7.0.0", + "bytemuck", + "ethnum", + "flate2", + "futures", + "hashbrown 0.15.4", + "lz4", + "num-traits", + "polars-arrow", + "polars-compute", + "polars-error", + "polars-parquet-format", + "polars-utils", + "serde", + "simdutf8", + "snap", + "streaming-decompression", + "zstd", +] + +[[package]] +name = "polars-parquet-format" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"c025243dcfe8dbc57e94d9f82eb3bef10b565ab180d5b99bed87fd8aea319ce1" +dependencies = [ + "async-trait", + "futures", +] + +[[package]] +name = "polars-plan" +version = "0.49.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "71fb4412c42bf637c2c02a617381c682ed425d9c8e4bd1fcb85cf352ed2a67c6" +dependencies = [ + "bitflags 2.9.1", + "bytemuck", + "bytes", + "chrono", + "chrono-tz 0.10.3", + "either", + "futures", + "hashbrown 0.15.4", + "memmap2 0.9.5", + "num-traits", + "percent-encoding", + "polars-arrow", + "polars-compute", + "polars-core", + "polars-io", + "polars-ops", + "polars-parquet", + "polars-time", + "polars-utils", + "rayon", + "recursive", + "regex", + "strum_macros", + "version_check", +] + +[[package]] +name = "polars-row" +version = "0.49.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08fb77ac1d37340d9cfe57cf58000cf3d9cce429e10d25066952c6145c684cc0" +dependencies = [ + "bitflags 2.9.1", + "bytemuck", + "polars-arrow", + "polars-compute", + "polars-error", + "polars-utils", +] + +[[package]] +name = "polars-schema" +version = "0.49.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ada7c7e2fbbeffbdd67628cd8a89f02b0a8d21c71d34e297e2463a7c17575203" +dependencies = [ + "indexmap 2.9.0", + "polars-error", + "polars-utils", + "serde", + "version_check", +] + +[[package]] +name = "polars-sql" +version = "0.49.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4a8e512b1f05ffda9963fe8f6a7c62dcba86be85218bc033ecdad2802cc1b1a0" +dependencies = [ + "bitflags 2.9.1", + "hex 0.4.3", + "polars-core", + "polars-error", + "polars-lazy", + "polars-ops", + "polars-plan", + "polars-time", + "polars-utils", + "rand 0.8.5", + "regex", + "serde", + "sqlparser", +] + +[[package]] +name = "polars-stream" +version = "0.49.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"5b0a02d8050acd9b64ed7e36c5bc96f6d4f46a940220f9c0e34c96b51f830f8c" +dependencies = [ + "async-channel", + "async-trait", + "atomic-waker", + "bitflags 2.9.1", + "crossbeam-channel", + "crossbeam-deque", + "crossbeam-queue", + "crossbeam-utils", + "futures", + "memmap2 0.9.5", + "parking_lot 0.12.4", + "percent-encoding", + "pin-project-lite", + "polars-arrow", + "polars-core", + "polars-error", + "polars-expr", + "polars-io", + "polars-mem-engine", + "polars-ops", + "polars-parquet", + "polars-plan", + "polars-utils", + "rand 0.8.5", + "rayon", + "recursive", + "slotmap", + "tokio", + "version_check", +] + +[[package]] +name = "polars-time" +version = "0.49.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72e84a30110880ffede8d93c085fc429ab1b8bf1acf3d6d489143dd34be374c4" +dependencies = [ + "atoi_simd", + "bytemuck", + "chrono", + "chrono-tz 0.10.3", + "now", + "num-traits", + "polars-arrow", + "polars-compute", + "polars-core", + "polars-error", + "polars-ops", + "polars-utils", + "rayon", + "regex", + "strum_macros", +] + +[[package]] +name = "polars-utils" +version = "0.49.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a05e033960552c47fc35afe14d5af5b29696acc97ae5d3c585ebc33c246cc15f" +dependencies = [ + "bincode", + "bytemuck", + "bytes", + "compact_str", + "flate2", + "foldhash", + "hashbrown 0.15.4", + "indexmap 2.9.0", + "libc", + "memmap2 0.9.5", + "num-traits", + "polars-error", + "rand 0.8.5", + "raw-cpuid", + "rayon", + "regex", + "rmp-serde", + "serde", + "serde_json", + "slotmap", + "stacker", + "version_check", +] + [[package]] name = "poly1305" version = "0.8.0" @@ -5561,18 +6360,18 @@ dependencies = [ [[package]] name = "profiling" -version = "1.0.16" +version = "1.0.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "afbdc74edc00b6f6a218ca6a5364d6226a259d4b8ea1af4a0ea063f27e179f4d" +checksum = 
"3eb8486b569e12e2c32ad3e204dbaba5e4b5b216e9367044f25f1dba42341773" dependencies = [ "profiling-procmacros", ] [[package]] name = "profiling-procmacros" -version = "1.0.16" +version = "1.0.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a65f2e60fbf1063868558d69c6beacf412dc755f9fc020f514b7955fc914fe30" +checksum = "52717f9a02b6965224f95ca2a81e2e0c5c43baacd28ca057577988930b6c3d5b" dependencies = [ "quote", "syn 2.0.104", @@ -5594,6 +6393,15 @@ version = "2.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "33cb294fe86a74cbcf50d4445b37da762029549ebeea341421c7c70370f86cac" +[[package]] +name = "psm" +version = "0.1.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e944464ec8536cd1beb0bbfd96987eb5e3b72f2ecdafdc5c769a37f1fa2ae1f" +dependencies = [ + "cc", +] + [[package]] name = "ptr_meta" version = "0.1.4" @@ -5833,6 +6641,16 @@ dependencies = [ "getrandom 0.3.3", ] +[[package]] +name = "rand_distr" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32cb0b9bc82b0a0876c2dd994a7e7a2683d3e7390ca40e6886785ef0c7e3ee31" +dependencies = [ + "num-traits", + "rand 0.8.5", +] + [[package]] name = "rav1e" version = "0.7.1" @@ -5965,6 +6783,7 @@ dependencies = [ "opendal", "openidconnect", "percent-encoding", + "polars", "quirks_path", "rand 0.9.1", "regex", @@ -6001,6 +6820,26 @@ dependencies = [ "webp", ] +[[package]] +name = "recursive" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0786a43debb760f491b1bc0269fe5e84155353c67482b9e60d0cfb596054b43e" +dependencies = [ + "recursive-proc-macro-impl", + "stacker", +] + +[[package]] +name = "recursive-proc-macro-impl" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76009fbe0614077fc1a2ce255e3a1881a2e3a3527097d5dc6d8212c585e7e38b" +dependencies = [ + "quote", + "syn 2.0.104", +] + [[package]] 
name = "redox_syscall" version = "0.2.16" @@ -6067,7 +6906,7 @@ checksum = "78c81d000a2c524133cc00d2f92f019d399e57906c3b7119271a2495354fe895" dependencies = [ "cfg-if", "libc", - "rustix", + "rustix 1.0.7", "windows 0.61.3", ] @@ -6160,6 +6999,7 @@ dependencies = [ "pin-project-lite", "quinn", "rustls", + "rustls-native-certs", "rustls-pki-types", "serde", "serde_json", @@ -6324,6 +7164,28 @@ dependencies = [ "libc", ] +[[package]] +name = "rmp" +version = "0.8.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "228ed7c16fa39782c3b3468e974aec2795e9089153cd08ee2e9aefb3613334c4" +dependencies = [ + "byteorder", + "num-traits", + "paste", +] + +[[package]] +name = "rmp-serde" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52e599a477cf9840e92f2cde9a7189e67b42c57532749bf90aea6ec10facd4db" +dependencies = [ + "byteorder", + "rmp", + "serde", +] + [[package]] name = "rsa" version = "0.9.8" @@ -6429,6 +7291,19 @@ dependencies = [ "semver", ] +[[package]] +name = "rustix" +version = "0.38.44" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fdb5bc1ae2baa591800df16c9ca78619bf65c0488b41b96ccec5d11220d8c154" +dependencies = [ + "bitflags 2.9.1", + "errno", + "libc", + "linux-raw-sys 0.4.15", + "windows-sys 0.59.0", +] + [[package]] name = "rustix" version = "1.0.7" @@ -6438,7 +7313,7 @@ dependencies = [ "bitflags 2.9.1", "errno", "libc", - "linux-raw-sys", + "linux-raw-sys 0.9.4", "windows-sys 0.59.0", ] @@ -7174,12 +8049,30 @@ dependencies = [ "num", ] +[[package]] +name = "skiplist" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0eec25f46463fcdc5e02f388c2780b1b58e01be81a8378e62ec60931beccc3f6" +dependencies = [ + "rand 0.8.5", +] + [[package]] name = "slab" version = "0.4.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"04dc19736151f35336d325007ac991178d504a119863a2fcb3758cdb5e52c50d" +[[package]] +name = "slotmap" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dbff4acf519f630b3a3ddcfaea6c06b42174d9a44bc70c620e9ed1649d58b82a" +dependencies = [ + "version_check", +] + [[package]] name = "slug" version = "0.1.6" @@ -7222,6 +8115,12 @@ dependencies = [ "syn 2.0.104", ] +[[package]] +name = "snap" +version = "1.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b6b67fb9a61334225b5b790716f609cd58395f895b3fe8b328786812a40bc3b" + [[package]] name = "socket2" version = "0.5.10" @@ -7260,6 +8159,15 @@ dependencies = [ "der", ] +[[package]] +name = "sqlparser" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05a528114c392209b3264855ad491fcce534b94a38771b0a0b97a79379275ce8" +dependencies = [ + "log", +] + [[package]] name = "sqlx" version = "0.8.6" @@ -7490,6 +8398,19 @@ version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" +[[package]] +name = "stacker" +version = "0.1.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cddb07e32ddb770749da91081d8d0ac3a16f1a569a18b20348cd371f5dead06b" +dependencies = [ + "cc", + "cfg-if", + "libc", + "psm", + "windows-sys 0.59.0", +] + [[package]] name = "static_assertions" version = "1.1.0" @@ -7502,6 +8423,27 @@ version = "1.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d7beae5182595e9a8b683fa98c4317f956c9a2dec3b9716990d20023cc60c766" +[[package]] +name = "streaming-decompression" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf6cc3b19bfb128a8ad11026086e31d3ce9ad23f8ea37354b31383a187c44cf3" +dependencies = [ + "fallible-streaming-iterator", +] + +[[package]] +name = "streaming-iterator" +version = 
"0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b2231b7c3057d5e4ad0156fb3dc807d900806020c5ffa3ee6ff2c8c76fb8520" + +[[package]] +name = "strength_reduce" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fe895eb47f22e2ddd4dabc02bce419d2e643c8e3b585c78158b349195bc24d82" + [[package]] name = "string_cache" version = "0.8.9" @@ -7698,7 +8640,7 @@ dependencies = [ "fastrand", "getrandom 0.3.3", "once_cell", - "rustix", + "rustix 1.0.7", "windows-sys 0.59.0", ] @@ -7720,7 +8662,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ab9d851b45e865f178319da0abdbfe6acbc4328759ff18dafc3a41c16b4cd2ee" dependencies = [ "chrono", - "chrono-tz", + "chrono-tz 0.9.0", "globwalk", "humansize", "lazy_static", @@ -8430,6 +9372,15 @@ version = "0.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e70f2a8b45122e719eb623c01822704c4e0907e7e426a05927e1a1cfff5b75d0" +[[package]] +name = "unicode-reverse" +version = "1.0.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4b6f4888ebc23094adfb574fdca9fdc891826287a6397d2cd28802ffd6f20c76" +dependencies = [ + "unicode-segmentation", +] + [[package]] name = "unicode-segmentation" version = "1.12.0" @@ -9222,7 +10173,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0d65cbf2f12c15564212d48f4e3dfb87923d25d611f2aed18f4cb23f0413d89e" dependencies = [ "libc", - "rustix", + "rustix 1.0.7", ] [[package]] @@ -9341,6 +10292,12 @@ dependencies = [ "syn 2.0.104", ] +[[package]] +name = "zlib-rs" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "626bd9fa9734751fc50d6060752170984d7053f5a39061f524cda68023d4db8a" + [[package]] name = "zstd" version = "0.13.3" @@ -9386,9 +10343,9 @@ dependencies = [ [[package]] name = "zune-jpeg" -version = "0.4.17" +version = "0.4.18" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "0f6fe2e33d02a98ee64423802e16df3de99c43e5cf5ff983767e1128b394c8ac" +checksum = "7384255a918371b5af158218d131530f694de9ad3815ebdd0453a940485cb0fa" dependencies = [ "zune-core", ] diff --git a/apps/recorder/Cargo.toml b/apps/recorder/Cargo.toml index 69cbe91..4cb8713 100644 --- a/apps/recorder/Cargo.toml +++ b/apps/recorder/Cargo.toml @@ -2,8 +2,20 @@ name = "recorder" version = "0.1.0" edition = "2024" - # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[features] +default = ["jxl"] +playground = ["dep:inquire", "dep:color-eyre", "dep:polars"] +testcontainers = [ + "dep:testcontainers", + "dep:testcontainers-modules", + "dep:testcontainers-ext", + "downloader/testcontainers", + "testcontainers-modules/postgres", +] +jxl = ["dep:jpegxl-rs", "dep:jpegxl-sys"] + [lib] name = "recorder" path = "src/lib.rs" @@ -13,17 +25,25 @@ name = "recorder_cli" path = "src/bin/main.rs" required-features = [] -[features] -default = ["jxl"] -playground = ["dep:inquire", "dep:color-eyre"] -testcontainers = [ - "dep:testcontainers", - "dep:testcontainers-modules", - "dep:testcontainers-ext", - "downloader/testcontainers", - "testcontainers-modules/postgres", -] -jxl = ["dep:jpegxl-rs", "dep:jpegxl-sys"] +[[example]] +name = "mikan_collect_classic_eps" +path = "examples/mikan_collect_classic_eps.rs" +required-features = ["playground"] + +[[example]] +name = "mikan_doppel_season_subscription" +path = "examples/mikan_doppel_season_subscription.rs" +required-features = ["playground"] + +[[example]] +name = "mikan_doppel_subscriber_subscription" +path = "examples/mikan_doppel_subscriber_subscription.rs" +required-features = ["playground"] + +[[example]] +name = "playground" +path = "examples/playground.rs" +required-features = ["playground"] [dependencies] downloader = { workspace = true } @@ -93,7 +113,7 @@ fancy-regex = "0.14" lightningcss = "1.0.0-alpha.66" html-escape = 
"0.2.13" opendal = { version = "0.53", features = ["default", "services-fs"] } -scraper = "0.23" +scraper = "0.23.1" async-graphql = { version = "7", features = ["dynamic-schema"] } async-graphql-axum = "7" seaography = { version = "1.1", features = [ @@ -134,11 +154,11 @@ icu = "2.0.0" tracing-tree = "0.4.0" num_cpus = "1.17.0" headers-accept = "0.1.4" +polars = { version = "0.49.1", features = ["parquet"], optional = true } [dev-dependencies] inquire = { workspace = true } color-eyre = { workspace = true } - serial_test = "3" insta = { version = "1", features = ["redactions", "toml", "filters"] } rstest = "0.25" diff --git a/apps/recorder/examples/mikan_collect_classic_eps.rs b/apps/recorder/examples/mikan_collect_classic_eps.rs new file mode 100644 index 0000000..c17604e --- /dev/null +++ b/apps/recorder/examples/mikan_collect_classic_eps.rs @@ -0,0 +1,443 @@ +use std::collections::HashSet; + +use chrono::{DateTime, Duration, FixedOffset, NaiveDate, NaiveTime, TimeZone, Utc}; +use fetch::{HttpClientConfig, fetch_html}; +use lazy_static::lazy_static; +use nom::{ + IResult, Parser, + branch::alt, + bytes::complete::{tag, take, take_till1}, + character::complete::space1, + combinator::map, +}; +use recorder::{ + errors::{RecorderError, RecorderResult}, + extract::{ + html::extract_inner_text_from_element_ref, + mikan::{MikanClient, MikanConfig, MikanEpisodeHash, MikanFansubHash}, + }, +}; +use regex::Regex; +use scraper::{ElementRef, Html, Selector}; +use snafu::FromString; +use url::Url; + +lazy_static! { + static ref TEST_FOLDER: std::path::PathBuf = + if cfg!(any(test, debug_assertions, feature = "playground")) { + std::path::PathBuf::from(format!( + "{}/tests/resources/mikan/classic_episodes", + env!("CARGO_MANIFEST_DIR") + )) + } else { + std::path::PathBuf::from("tests/resources/mikan/classic_episodes") + }; +} + +lazy_static! 
{
+    /// Captures the `total` value passed to the `bootpag` call of the pagination widget.
+    static ref TOTAL_PAGE_REGEX: Regex =
+        Regex::new(r#"\$\(\'\.classic-view-pagination2\'\)\.bootpag\(\{\s*total:\s*(\d+)"#)
+            .unwrap();
+
+    // Selectors are compiled once here instead of once per page and once per table row.
+    static ref ROW_SELECTOR: Selector = Selector::parse("tbody tr").unwrap();
+    static ref PUBLISH_AT_SELECTOR: Selector = Selector::parse("td:nth-of-type(1)").unwrap();
+    static ref FANSUB_SELECTOR: Selector = Selector::parse("td:nth-of-type(2) > a").unwrap();
+    static ref ORIGINAL_NAME_SELECTOR: Selector =
+        Selector::parse("td:nth-of-type(3) > a:nth-of-type(1)").unwrap();
+    static ref MAGNET_LINK_SELECTOR: Selector =
+        Selector::parse("td:nth-of-type(3) > a:nth-of-type(2)").unwrap();
+    static ref FILE_SIZE_SELECTOR: Selector = Selector::parse("td:nth-of-type(4)").unwrap();
+    static ref TORRENT_LINK_SELECTOR: Selector =
+        Selector::parse("td:nth-of-type(5) > a").unwrap();
+}
+
+/// One parsed row of the Mikan "classic" episode table.
+pub struct MikanClassicEpisodeTableRow {
+    pub id: i32,
+    pub publish_at: DateTime<Utc>,
+    pub mikan_fansub_id: Option<String>,
+    pub fansub_name: Option<String>,
+    pub mikan_episode_id: String,
+    pub original_name: String,
+    pub magnet_link: Option<String>,
+    pub file_size: Option<String>,
+    pub torrent_link: Option<String>,
+}
+
+impl MikanClassicEpisodeTableRow {
+    /// Fixed UTC+08:00 offset in which the table's timestamps are interpreted.
+    fn timezone() -> FixedOffset {
+        FixedOffset::east_opt(8 * 3600).unwrap()
+    }
+
+    /// Parses the relative dates "今天" (today) and "昨天" (yesterday), evaluated in
+    /// [`Self::timezone`] at call time.
+    fn fixed_date_parser(input: &str) -> IResult<&str, NaiveDate> {
+        alt((
+            map(tag("今天"), move |_| {
+                Utc::now().with_timezone(&Self::timezone()).date_naive()
+            }),
+            map(tag("昨天"), move |_| {
+                Utc::now().with_timezone(&Self::timezone()).date_naive() - Duration::days(1)
+            }),
+        ))
+        .parse(input)
+    }
+
+    /// Parses an absolute `%Y/%m/%d` date that runs up to the next whitespace character.
+    fn formatted_date_parser(input: &str) -> IResult<&str, NaiveDate> {
+        let (remain, date_str) = take_till1(|c: char| c.is_whitespace()).parse(input)?;
+        let date = NaiveDate::parse_from_str(date_str, "%Y/%m/%d").map_err(|_| {
+            nom::Err::Error(nom::error::Error::new(input, nom::error::ErrorKind::Verify))
+        })?;
+        Ok((remain, date))
+    }
+
+    /// Accepts either a relative or an absolute date.
+    fn date_parser(input: &str) -> IResult<&str, NaiveDate> {
+        alt((Self::fixed_date_parser, Self::formatted_date_parser)).parse(input)
+    }
+
+    /// Parses a five-character `%H:%M` time of day.
+    fn time_parser(input: &str) -> IResult<&str, NaiveTime> {
+        let (remain, time_str) = take(5usize).parse(input)?;
+        let time = NaiveTime::parse_from_str(time_str, "%H:%M").map_err(|_| {
+            nom::Err::Error(nom::error::Error::new(input, nom::error::ErrorKind::Verify))
+        })?;
+        Ok((remain, time))
+    }
+
+    /// Parses `<date> <time>` as a [`Self::timezone`] local time and converts it to UTC.
+    ///
+    /// Returns `None` when the text does not match that layout.
+    fn extract_publish_at(text: &str) -> Option<DateTime<Utc>> {
+        let (_, (date, _, time)) = (Self::date_parser, space1, Self::time_parser)
+            .parse(text)
+            .ok()?;
+        let local_dt = Self::timezone()
+            .from_local_datetime(&date.and_time(time))
+            .single()?;
+        Some(local_dt.with_timezone(&Utc))
+    }
+
+    /// Builds a row from one `<tr>` element of the episode table.
+    ///
+    /// `rev_id` and `idx` are combined into the row id as `rev_id * 1000 + idx`, which stays
+    /// unique only while a page holds fewer than 1000 rows.
+    ///
+    /// # Errors
+    ///
+    /// Fails when the episode link, the original name or the publish time cannot be
+    /// extracted; the message lists the missing fields.
+    pub fn from_element_ref(
+        row: ElementRef<'_>,
+        rev_id: i32,
+        idx: i32,
+        mikan_base_url: &Url,
+    ) -> RecorderResult<Self> {
+        let publish_at = row
+            .select(&PUBLISH_AT_SELECTOR)
+            .next()
+            .map(extract_inner_text_from_element_ref)
+            .and_then(|text| Self::extract_publish_at(&text));
+
+        let (mikan_fansub_hash, fansub_name) = row
+            .select(&FANSUB_SELECTOR)
+            .next()
+            .and_then(|el| {
+                el.attr("href")
+                    .and_then(|href| mikan_base_url.join(href).ok())
+                    .and_then(|url| MikanFansubHash::from_homepage_url(&url))
+                    .map(|hash| (hash, extract_inner_text_from_element_ref(el)))
+            })
+            .unzip();
+
+        let (mikan_episode_hash, original_name) = row
+            .select(&ORIGINAL_NAME_SELECTOR)
+            .next()
+            .and_then(|el| {
+                el.attr("href")
+                    .and_then(|href| mikan_base_url.join(href).ok())
+                    .and_then(|url| MikanEpisodeHash::from_homepage_url(&url))
+                    .map(|hash| (hash, extract_inner_text_from_element_ref(el)))
+            })
+            .unzip();
+
+        let magnet_link = row
+            .select(&MAGNET_LINK_SELECTOR)
+            .next()
+            .and_then(|el| el.attr("data-clipboard-text"))
+            .map(str::to_string);
+
+        let file_size = row
+            .select(&FILE_SIZE_SELECTOR)
+            .next()
+            .map(extract_inner_text_from_element_ref);
+
+        let torrent_link = row
+            .select(&TORRENT_LINK_SELECTOR)
+            .next()
+            .and_then(|el| el.attr("href"))
+            .map(str::to_string);
+
+        match (mikan_episode_hash, original_name, publish_at) {
+            (Some(mikan_episode_hash), Some(original_name), Some(publish_at)) => Ok(Self {
+                id: rev_id * 1000 + idx,
+                publish_at,
+                mikan_fansub_id: mikan_fansub_hash.map(|hash| hash.mikan_fansub_id),
+                fansub_name,
+                mikan_episode_id: mikan_episode_hash.mikan_episode_id,
+                original_name,
+                magnet_link,
+                file_size,
+                torrent_link,
+            }),
+            (mikan_episode_hash, original_name, publish_at) => {
+                let mut missing_fields = vec![];
+                if mikan_episode_hash.is_none() {
+                    missing_fields.push("mikan_episode_id");
+                }
+                if original_name.is_none() {
+                    missing_fields.push("original_name");
+                }
+                if publish_at.is_none() {
+                    missing_fields.push("publish_at");
+                }
+                Err(RecorderError::without_source(format!(
+                    "Failed to parse episode table row, missing fields: {missing_fields:?}, row \
+                     index: {idx}"
+                )))
+            }
+        }
+    }
+}
+
+/// One page of the classic episode table together with the HTML it was parsed from.
+pub struct MikanClassicEpisodeTablePage {
+    pub page: i32,
+    pub total: i32,
+    pub html: String,
+    pub rows: Vec<MikanClassicEpisodeTableRow>,
+}
+
+impl MikanClassicEpisodeTablePage {
+    /// Parses the pagination total and every table row out of `html`.
+    ///
+    /// When `updated_info` is given, its second element replaces the total found in the
+    /// HTML. Rows are visited bottom-up and numbered from `0` in that order.
+    ///
+    /// # Errors
+    ///
+    /// Fails when the pagination total is missing from `html` or when any row fails to
+    /// parse.
+    pub fn from_html(
+        html: String,
+        mikan_base_url: &Url,
+        page: i32,
+        updated_info: Option<(i32, i32)>,
+    ) -> RecorderResult<Self> {
+        let Some(parsed_total) = TOTAL_PAGE_REGEX
+            .captures(&html)
+            .and_then(|caps| caps.get(1))
+            .and_then(|m| m.as_str().parse::<i32>().ok())
+        else {
+            return Err(RecorderError::without_source(
+                "Failed to parse pagination meta and rows".into(),
+            ));
+        };
+
+        let total = updated_info.map_or(parsed_total, |(_, update_total)| update_total);
+        let rev_id = total - page;
+
+        let doc = Html::parse_document(&html);
+        let rows = doc
+            .select(&ROW_SELECTOR)
+            .rev()
+            .zip(0_i32..)
+            .map(|(tr, idx)| {
+                MikanClassicEpisodeTableRow::from_element_ref(tr, rev_id, idx, mikan_base_url)
+            })
+            .collect::<RecorderResult<Vec<_>>>()?;
+
+        Ok(Self {
+            page,
+            total,
+            html,
+            rows,
+        })
+    }
+
+    /// Writes the page as `rev_<id>.html`, `rev_<id>.parquet` and `rev_<id>.csv`.
+    ///
+    /// The CSV file is written last because [`Self::waiting_rev_ids`] treats an existing
+    /// entry in the `csv` folder as the marker of a finished page.
+    pub fn save_to_files(&self) -> RecorderResult<()> {
+        use polars::prelude::*;
+
+        let rev_id = self.total - self.page;
+        let parquet_path = TEST_FOLDER.join(format!("parquet/rev_{rev_id}.parquet"));
+        let csv_path = TEST_FOLDER.join(format!("csv/rev_{rev_id}.csv"));
+        let html_path = TEST_FOLDER.join(format!("html/rev_{rev_id}.html"));
+
+        std::fs::write(html_path, &self.html)?;
+
+        let row_count = self.rows.len();
+        let mut publish_at_vec = Vec::with_capacity(row_count);
+        let mut mikan_fansub_id_vec = Vec::with_capacity(row_count);
+        let mut fansub_name_vec = Vec::with_capacity(row_count);
+        let mut mikan_episode_id_vec = Vec::with_capacity(row_count);
+        let mut original_name_vec = Vec::with_capacity(row_count);
+        let mut magnet_link_vec = Vec::with_capacity(row_count);
+        let mut file_size_vec = Vec::with_capacity(row_count);
+        let mut torrent_link_vec = Vec::with_capacity(row_count);
+
+        for row in &self.rows {
+            publish_at_vec.push(row.publish_at.to_rfc3339());
+            mikan_fansub_id_vec.push(row.mikan_fansub_id.clone());
+            fansub_name_vec.push(row.fansub_name.clone());
+            mikan_episode_id_vec.push(row.mikan_episode_id.clone());
+            original_name_vec.push(row.original_name.clone());
+            magnet_link_vec.push(row.magnet_link.clone());
+            file_size_vec.push(row.file_size.clone());
+            torrent_link_vec.push(row.torrent_link.clone());
+        }
+
+        let mut df = df! [
+            "publish_at_timestamp" => publish_at_vec,
+            "mikan_fansub_id" => mikan_fansub_id_vec,
+            "fansub_name" => fansub_name_vec,
+            "mikan_episode_id" => mikan_episode_id_vec,
+            "original_name" => original_name_vec,
+            "magnet_link" => magnet_link_vec,
+            "file_size" => file_size_vec,
+            "torrent_link" => torrent_link_vec,
+        ]
+        .map_err(|e| {
+            let message = format!("Failed to create DataFrame: {e}");
+            RecorderError::with_source(Box::new(e), message)
+        })?;
+
+        let mut parquet_file = std::fs::File::create(&parquet_path)?;
+
+        ParquetWriter::new(&mut parquet_file)
+            .finish(&mut df)
+            .map_err(|e| {
+                let message = format!("Failed to write parquet file: {e}");
+                RecorderError::with_source(Box::new(e), message)
+            })?;
+
+        let mut csv_file = std::fs::File::create(&csv_path)?;
+
+        CsvWriter::new(&mut csv_file)
+            .include_header(true)
+            .with_quote_style(QuoteStyle::Always)
+            .finish(&mut df)
+            .map_err(|e| {
+                let message = format!("Failed to write csv file: {e}");
+                RecorderError::with_source(Box::new(e), message)
+            })?;
+
+        println!(
+            "[{}/{}] Saved {} rows to rev_{}.{{parquet,html,csv}}",
+            self.page,
+            self.total,
+            self.rows.len(),
+            rev_id
+        );
+
+        Ok(())
+    }
+
+    /// Lists the reverse page ids in `0..total` that have no `rev_<id>` entry in the `csv`
+    /// folder yet.
+    ///
+    /// # Errors
+    ///
+    /// Fails when the `csv` folder cannot be read.
+    pub fn waiting_rev_ids(total: i32) -> RecorderResult<Vec<i32>> {
+        let dir = TEST_FOLDER.join("csv");
+
+        let finished_rev_ids = std::fs::read_dir(dir)?
+            .filter_map(Result::ok)
+            .filter_map(|entry| {
+                entry
+                    .path()
+                    .file_stem()
+                    .and_then(|stem| stem.to_str())
+                    // Strip only the leading marker; `replace` would also drop any later
+                    // `rev_` occurrence inside the stem.
+                    .and_then(|stem| stem.strip_prefix("rev_"))
+                    .and_then(|id| id.parse::<i32>().ok())
+            })
+            .collect::<HashSet<_>>();
+
+        Ok((0..total)
+            .filter(|rev_id| !finished_rev_ids.contains(rev_id))
+            .collect())
+    }
+}
+
+/// Loads one classic-table page, preferring the cached `html/rev_<id>.html` copy.
+///
+/// `updated_info` is `(rev_id, total)`; without it the page is always fetched. Freshly
+/// fetched HTML is also written to `html/temp.html` before it is parsed.
+async fn scrape_mikan_classic_episode_table_page(
+    mikan_client: &MikanClient,
+    page: i32,
+    updated_info: Option<(i32, i32)>,
+) -> RecorderResult<MikanClassicEpisodeTablePage> {
+    let mikan_base_url = mikan_client.base_url();
+    let url = mikan_base_url.join(&format!("/Home/Classic/{page}"))?;
+
+    if let Some((rev_id, update_total)) = updated_info {
+        let html_path = TEST_FOLDER.join(format!("html/rev_{rev_id}.html"));
+        if html_path.exists() {
+            let html = std::fs::read_to_string(&html_path)?;
+            println!("[{page}/{update_total}] html exists, skipping fetch");
+            return MikanClassicEpisodeTablePage::from_html(
+                html,
+                mikan_base_url,
+                page,
+                updated_info,
+            );
+        }
+    }
+
+    let total = updated_info.map_or_else(
+        || "Unknown".to_string(),
+        |(_, update_total)| update_total.to_string(),
+    );
+
+    println!("[{page}/{total}] fetching html...");
+
+    let html = fetch_html(mikan_client, url).await?;
+
+    println!("[{page}/{total}] fetched html done");
+
+    std::fs::write(TEST_FOLDER.join("html/temp.html"), &html)?;
+
+    MikanClassicEpisodeTablePage::from_html(html, mikan_base_url, page, updated_info)
+}
+
+/// Loads the page whose reverse id is `rev_idx`, i.e. page `total - rev_idx`.
+async fn scrape_mikan_classic_episode_table_page_from_rev_id(
+    mikan_client: &MikanClient,
+    total: i32,
+    rev_idx: i32,
+) -> RecorderResult<MikanClassicEpisodeTablePage> {
+    let page = total - rev_idx;
+
+    scrape_mikan_classic_episode_table_page(mikan_client, page, Some((rev_idx, total))).await
+}
+
+/// Scrapes page 1 to learn the page count, then processes every page that
+/// `MikanClassicEpisodeTablePage::waiting_rev_ids` still reports as pending.
+#[tokio::main]
+async fn main() -> RecorderResult<()> {
+    for sub_dir in ["html", "parquet", "csv"] {
+        std::fs::create_dir_all(TEST_FOLDER.join(sub_dir))?;
+    }
+
+    let http_client = HttpClientConfig {
+        exponential_backoff_max_retries: Some(3),
+        leaky_bucket_max_tokens: Some(2),
+        leaky_bucket_initial_tokens: Some(0),
+        leaky_bucket_refill_tokens: Some(1),
+        leaky_bucket_refill_interval: Some(std::time::Duration::from_millis(1000)),
+        user_agent: Some(
+            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) \
+             Chrome/136.0.0.0 Safari/537.36 Edg/136.0.0.0"
+                .to_string(),
+        ),
+        ..Default::default()
+    };
+    let mikan_config = MikanConfig {
+        http_client,
+        base_url: Url::parse("https://mikanani.me")?,
+    };
+    let mikan_scrape_client = MikanClient::from_config(mikan_config).await?;
+
+    // The first page is always fetched: it is the only source of the total page count.
+    let first_page = scrape_mikan_classic_episode_table_page(&mikan_scrape_client, 1, None).await?;
+    let total_page = first_page.total;
+    first_page.save_to_files()?;
+
+    for todo_rev_id in MikanClassicEpisodeTablePage::waiting_rev_ids(total_page)? {
+        scrape_mikan_classic_episode_table_page_from_rev_id(
+            &mikan_scrape_client,
+            total_page,
+            todo_rev_id,
+        )
+        .await?
+        .save_to_files()?;
+    }
+
+    Ok(())
+}
diff --git a/apps/recorder/src/extract/mikan/constants.rs b/apps/recorder/src/extract/mikan/constants.rs
index 83ba0a6..6a5ae2b 100644
--- a/apps/recorder/src/extract/mikan/constants.rs
+++ b/apps/recorder/src/extract/mikan/constants.rs
@@ -12,6 +12,7 @@ pub const MIKAN_BANGUMI_POSTER_PATH: &str = "/images/Bangumi";
 pub const MIKAN_EPISODE_TORRENT_PATH: &str = "/Download";
 pub const MIKAN_SUBSCRIBER_SUBSCRIPTION_RSS_PATH: &str = "/RSS/MyBangumi";
 pub const MIKAN_BANGUMI_RSS_PATH: &str = "/RSS/Bangumi";
+pub const MIKAN_FANSUB_HOMEPAGE_PATH: &str = "/Home/PublishGroup";
 pub const MIKAN_BANGUMI_ID_QUERY_KEY: &str = "bangumiId";
 pub
const MIKAN_FANSUB_ID_QUERY_KEY: &str = "subgroupid"; pub const MIKAN_SUBSCRIBER_SUBSCRIPTION_TOKEN_QUERY_KEY: &str = "token"; diff --git a/apps/recorder/src/extract/mikan/mod.rs b/apps/recorder/src/extract/mikan/mod.rs index 0ed0456..e8c48c4 100644 --- a/apps/recorder/src/extract/mikan/mod.rs +++ b/apps/recorder/src/extract/mikan/mod.rs @@ -11,10 +11,11 @@ pub use constants::{ MIKAN_ACCOUNT_MANAGE_PAGE_PATH, MIKAN_BANGUMI_EXPAND_SUBSCRIBED_PAGE_PATH, MIKAN_BANGUMI_HOMEPAGE_PATH, MIKAN_BANGUMI_ID_QUERY_KEY, MIKAN_BANGUMI_POSTER_PATH, MIKAN_BANGUMI_RSS_PATH, MIKAN_EPISODE_HOMEPAGE_PATH, MIKAN_EPISODE_TORRENT_PATH, - MIKAN_FANSUB_ID_QUERY_KEY, MIKAN_LOGIN_PAGE_PATH, MIKAN_LOGIN_PAGE_SEARCH, - MIKAN_POSTER_BUCKET_KEY, MIKAN_SEASON_FLOW_PAGE_PATH, MIKAN_SEASON_STR_QUERY_KEY, - MIKAN_SUBSCRIBER_SUBSCRIPTION_RSS_PATH, MIKAN_SUBSCRIBER_SUBSCRIPTION_TOKEN_QUERY_KEY, - MIKAN_UNKNOWN_FANSUB_ID, MIKAN_UNKNOWN_FANSUB_NAME, MIKAN_YEAR_QUERY_KEY, + MIKAN_FANSUB_HOMEPAGE_PATH, MIKAN_FANSUB_ID_QUERY_KEY, MIKAN_LOGIN_PAGE_PATH, + MIKAN_LOGIN_PAGE_SEARCH, MIKAN_POSTER_BUCKET_KEY, MIKAN_SEASON_FLOW_PAGE_PATH, + MIKAN_SEASON_STR_QUERY_KEY, MIKAN_SUBSCRIBER_SUBSCRIPTION_RSS_PATH, + MIKAN_SUBSCRIBER_SUBSCRIPTION_TOKEN_QUERY_KEY, MIKAN_UNKNOWN_FANSUB_ID, + MIKAN_UNKNOWN_FANSUB_NAME, MIKAN_YEAR_QUERY_KEY, }; pub use credential::MikanCredentialForm; pub use subscription::{ @@ -22,11 +23,12 @@ pub use subscription::{ }; pub use web::{ MikanBangumiHash, MikanBangumiIndexHash, MikanBangumiIndexMeta, MikanBangumiMeta, - MikanBangumiPosterMeta, MikanEpisodeHash, MikanEpisodeMeta, MikanRssEpisodeItem, - MikanSeasonFlowUrlMeta, MikanSeasonStr, MikanSubscriberSubscriptionRssUrlMeta, - build_mikan_bangumi_expand_subscribed_url, build_mikan_bangumi_homepage_url, - build_mikan_bangumi_subscription_rss_url, build_mikan_episode_homepage_url, - build_mikan_season_flow_url, build_mikan_subscriber_subscription_rss_url, + MikanBangumiPosterMeta, MikanEpisodeHash, MikanEpisodeMeta, MikanFansubHash, + 
MikanRssEpisodeItem, MikanSeasonFlowUrlMeta, MikanSeasonStr,
+    MikanSubscriberSubscriptionRssUrlMeta, build_mikan_bangumi_expand_subscribed_url,
+    build_mikan_bangumi_homepage_url, build_mikan_bangumi_subscription_rss_url,
+    build_mikan_episode_homepage_url, build_mikan_season_flow_url,
+    build_mikan_subscriber_subscription_rss_url,
     extract_mikan_bangumi_index_meta_list_from_season_flow_fragment,
     extract_mikan_bangumi_meta_from_expand_subscribed_fragment,
     extract_mikan_episode_meta_from_episode_homepage_html,
diff --git a/apps/recorder/src/extract/mikan/web.rs b/apps/recorder/src/extract/mikan/web.rs
index 8a42385..deb1b0d 100644
--- a/apps/recorder/src/extract/mikan/web.rs
+++ b/apps/recorder/src/extract/mikan/web.rs
@@ -22,8 +22,8 @@ use crate::{
         mikan::{
             MIKAN_BANGUMI_EXPAND_SUBSCRIBED_PAGE_PATH, MIKAN_BANGUMI_HOMEPAGE_PATH,
             MIKAN_BANGUMI_ID_QUERY_KEY, MIKAN_BANGUMI_POSTER_PATH, MIKAN_BANGUMI_RSS_PATH,
-            MIKAN_EPISODE_HOMEPAGE_PATH, MIKAN_FANSUB_ID_QUERY_KEY, MIKAN_POSTER_BUCKET_KEY,
-            MIKAN_SEASON_FLOW_PAGE_PATH, MIKAN_SEASON_STR_QUERY_KEY,
+            MIKAN_EPISODE_HOMEPAGE_PATH, MIKAN_FANSUB_HOMEPAGE_PATH, MIKAN_FANSUB_ID_QUERY_KEY,
+            MIKAN_POSTER_BUCKET_KEY, MIKAN_SEASON_FLOW_PAGE_PATH, MIKAN_SEASON_STR_QUERY_KEY,
             MIKAN_SUBSCRIBER_SUBSCRIPTION_RSS_PATH, MIKAN_SUBSCRIBER_SUBSCRIPTION_TOKEN_QUERY_KEY,
             MIKAN_YEAR_QUERY_KEY, MikanClient,
         },
@@ -205,6 +205,39 @@ impl MikanBangumiMeta {
     }
 }
 
+/// Identifier of a fansub (publish group) as it appears in Mikan homepage URLs.
+#[derive(Clone, Debug, PartialEq, Eq, Hash)]
+pub struct MikanFansubHash {
+    pub mikan_fansub_id: String,
+}
+
+impl MikanFansubHash {
+    /// Extracts the fansub id from a `{MIKAN_FANSUB_HOMEPAGE_PATH}/<id>` URL.
+    ///
+    /// Returns `None` unless the path is the homepage prefix followed by `/` and a non-empty
+    /// id. A plain `starts_with` check would also accept the bare prefix or a longer sibling
+    /// segment and then return the whole path as the id.
+    pub fn from_homepage_url(url: &Url) -> Option<Self> {
+        url.path()
+            .strip_prefix(MIKAN_FANSUB_HOMEPAGE_PATH)
+            .and_then(|rest| rest.strip_prefix('/'))
+            .filter(|mikan_fansub_id| !mikan_fansub_id.is_empty())
+            .map(|mikan_fansub_id| Self {
+                mikan_fansub_id: mikan_fansub_id.to_string(),
+            })
+    }
+
+    /// Builds the fansub homepage URL on `mikan_base_url`, replacing its path.
+    pub fn build_homepage_url(self, mikan_base_url: Url) -> Url {
+        let mut url = mikan_base_url;
+        url.set_path(&format!(
+            "{MIKAN_FANSUB_HOMEPAGE_PATH}/{}",
+            self.mikan_fansub_id
+        ));
+        url
+    }
+}
+
 #[derive(Clone, Debug, PartialEq)]
 pub struct MikanEpisodeMeta {
     pub homepage: Url,
diff --git a/apps/recorder/src/models/bangumi.rs b/apps/recorder/src/models/bangumi.rs
index 30e14bc..f3688d1 100644
--- a/apps/recorder/src/models/bangumi.rs
+++ b/apps/recorder/src/models/bangumi.rs
@@ -152,7 +152,10 @@ impl ActiveModel {
             season_raw: ActiveValue::Set(season_raw),
             fansub: ActiveValue::Set(Some(meta.fansub)),
             poster_link: ActiveValue::Set(poster_link),
-            origin_poster_link: ActiveValue::Set(meta.origin_poster_src.map(|src| src.to_string())),
+            origin_poster_link: ActiveValue::Set(
+                meta.origin_poster_src
+                    .map(|src| src[url::Position::BeforePath..].to_string()),
+            ),
             homepage: ActiveValue::Set(Some(meta.homepage.to_string())),
             rss_link: ActiveValue::Set(Some(rss_url.to_string())),
             ..Default::default()