fix: fix mikan web extractors

This commit is contained in:
2025-02-25 01:02:38 +08:00
parent 09565bd827
commit 5bc5d98823
26 changed files with 9537 additions and 659 deletions

View File

@@ -1,3 +1,11 @@
pub mod styles;
pub use styles::parse_style_attr;
use html_escape::decode_html_entities;
use itertools::Itertools;
use scraper::ElementRef;
pub use styles::{extract_background_image_src_from_style_attr, extract_style_from_attr};
/// Builds a single string from the text pieces yielded by `el.text()`.
///
/// The pieces are concatenated with a `,` between consecutive items, HTML
/// entities in the combined string are decoded, and leading/trailing
/// whitespace is stripped from the final result.
pub fn extract_inner_text_from_element_ref(el: ElementRef<'_>) -> String {
    // `Itertools::join` streams the pieces straight into one `String`.
    let joined = el.text().join(",");
    let decoded = decode_html_entities(&joined);
    decoded.trim().to_string()
}

View File

@@ -1,6 +1,45 @@
use lightningcss::declaration::DeclarationBlock;
use lightningcss::{
declaration::DeclarationBlock, properties::Property, values::image::Image as CSSImage,
};
use url::Url;
pub fn parse_style_attr(style_attr: &str) -> Option<DeclarationBlock> {
use crate::extract::media::extract_image_src_from_str;
/// Parses the contents of an inline `style` attribute into a
/// [`DeclarationBlock`] using default parser options.
///
/// Returns `None` when the parser reports an error; the error itself is
/// discarded.
pub fn extract_style_from_attr(style_attr: &str) -> Option<DeclarationBlock> {
    DeclarationBlock::parse_string(style_attr, Default::default()).ok()
}
pub fn extract_background_image_src_from_style_attr(
style_attr: &str,
base_url: &Url,
) -> Option<Url> {
extract_style_from_attr(style_attr).and_then(|style| {
style.iter().find_map(|(prop, _)| {
match prop {
Property::BackgroundImage(images) => {
for img in images {
if let CSSImage::Url(path) = img {
if let Some(url) = extract_image_src_from_str(path.url.trim(), base_url)
{
return Some(url);
}
}
}
}
Property::Background(backgrounds) => {
for bg in backgrounds {
if let CSSImage::Url(path) = &bg.image {
if let Some(url) = extract_image_src_from_str(path.url.trim(), base_url)
{
return Some(url);
}
}
}
}
_ => {}
}
None
})
})
}