Use html5ever to replace html_parser

This commit is contained in:
2023-10-31 10:33:23 +00:00
committed by GitHub
parent afac6553d9
commit c7cdfac896
5 changed files with 212 additions and 175 deletions

249
Cargo.lock generated
View File

@@ -192,15 +192,6 @@ version = "2.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b4682ae6287fcf752ecaabbfcc7b6f9b72aa33933dc23a554d853aea8eea8635"
[[package]]
name = "block-buffer"
version = "0.10.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71"
dependencies = [
"generic-array",
]
[[package]]
name = "brotli"
version = "3.4.0"
@@ -361,15 +352,6 @@ version = "0.8.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e496a50fda8aacccc86d7529e2c1e0892dbd0f898a6b5645b5561b89c3210efa"
[[package]]
name = "cpufeatures"
version = "0.2.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a17b76ff3a4162b0b27f354a0c87015ddad39d35f9c0c36607a3bdd175dde1f1"
dependencies = [
"libc",
]
[[package]]
name = "crc32fast"
version = "1.3.2"
@@ -379,16 +361,6 @@ dependencies = [
"cfg-if",
]
[[package]]
name = "crypto-common"
version = "0.1.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3"
dependencies = [
"generic-array",
"typenum",
]
[[package]]
name = "dateparser"
version = "0.2.0"
@@ -431,16 +403,6 @@ version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3c877555693c14d2f84191cfd3ad8582790fc52b5e2274b40b59cf5f5cea25c7"
[[package]]
name = "digest"
version = "0.10.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292"
dependencies = [
"block-buffer",
"crypto-common",
]
[[package]]
name = "either"
version = "1.9.0"
@@ -627,6 +589,16 @@ dependencies = [
"percent-encoding",
]
[[package]]
name = "futf"
version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "df420e2e84819663797d1ec6544b13c5be84629e7bb00dc960d6917db2987843"
dependencies = [
"mac",
"new_debug_unreachable",
]
[[package]]
name = "futures-channel"
version = "0.3.28"
@@ -688,16 +660,6 @@ dependencies = [
"slab",
]
[[package]]
name = "generic-array"
version = "0.14.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a"
dependencies = [
"typenum",
"version_check",
]
[[package]]
name = "getopts"
version = "0.2.21"
@@ -806,17 +768,17 @@ dependencies = [
]
[[package]]
name = "html_parser"
version = "0.6.3"
name = "html5ever"
version = "0.26.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ec016cabcf7c9c48f9d5fdc6b03f273585bfce640a0f47a69552039e92b1959a"
checksum = "bea68cab48b8459f17cf1c944c67ddc572d272d9f2b274140f223ecb1da4a3b7"
dependencies = [
"pest",
"pest_derive",
"serde",
"serde_derive",
"serde_json",
"thiserror",
"log",
"mac",
"markup5ever",
"proc-macro2",
"quote",
"syn 1.0.109",
]
[[package]]
@@ -1156,6 +1118,38 @@ dependencies = [
"winapi",
]
[[package]]
name = "mac"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4"
[[package]]
name = "markup5ever"
version = "0.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7a2629bb1404f3d34c2e921f21fd34ba00b206124c81f65c50b43b6aaefeb016"
dependencies = [
"log",
"phf",
"phf_codegen",
"string_cache",
"string_cache_codegen",
"tendril",
]
[[package]]
name = "markup5ever_rcdom"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b9521dd6750f8e80ee6c53d65e2e4656d7de37064f3a7a5d2d11d05df93839c2"
dependencies = [
"html5ever",
"markup5ever",
"tendril",
"xml5ever",
]
[[package]]
name = "md5"
version = "0.7.0"
@@ -1264,6 +1258,12 @@ dependencies = [
"tempfile",
]
[[package]]
name = "new_debug_unreachable"
version = "1.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e4a24736216ec316047a1fc4252e27dabb04218aa4a3f37c6e7ddbf1f9782b54"
[[package]]
name = "nom"
version = "7.1.3"
@@ -1486,48 +1486,41 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9b2a4787296e9989611394c33f193f676704af1686e70b8f8033ab5ba9a35a94"
[[package]]
name = "pest"
version = "2.7.4"
name = "phf"
version = "0.10.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c022f1e7b65d6a24c0dbbd5fb344c66881bc01f3e5ae74a1c8100f2f985d98a4"
checksum = "fabbf1ead8a5bcbc20f5f8b939ee3f5b0f6f281b6ad3468b84656b658b455259"
dependencies = [
"memchr",
"thiserror",
"ucd-trie",
"phf_shared",
]
[[package]]
name = "pest_derive"
version = "2.7.4"
name = "phf_codegen"
version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "35513f630d46400a977c4cb58f78e1bfbe01434316e60c37d27b9ad6139c66d8"
checksum = "4fb1c3a8bc4dd4e5cfce29b44ffc14bedd2ee294559a294e2a4d4c9e9a6a13cd"
dependencies = [
"pest",
"pest_generator",
"phf_generator",
"phf_shared",
]
[[package]]
name = "pest_generator"
version = "2.7.4"
name = "phf_generator"
version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bc9fc1b9e7057baba189b5c626e2d6f40681ae5b6eb064dc7c7834101ec8123a"
checksum = "5d5285893bb5eb82e6aaf5d59ee909a06a16737a8970984dd7746ba9283498d6"
dependencies = [
"pest",
"pest_meta",
"proc-macro2",
"quote",
"syn 2.0.37",
"phf_shared",
"rand",
]
[[package]]
name = "pest_meta"
version = "2.7.4"
name = "phf_shared"
version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1df74e9e7ec4053ceb980e7c0c8bd3594e977fde1af91daba9c928e8e8c6708d"
checksum = "b6796ad771acdc0123d2a88dc428b5e38ef24456743ddb1744ed628f9815c096"
dependencies = [
"once_cell",
"pest",
"sha2",
"siphasher",
]
[[package]]
@@ -1564,7 +1557,7 @@ dependencies = [
"getopts",
"gettext",
"hex",
"html_parser",
"html5ever",
"http",
"http-content-range",
"hyper",
@@ -1577,6 +1570,7 @@ dependencies = [
"link-cplusplus",
"log",
"log4rs",
"markup5ever_rcdom",
"md5",
"modular-bitfield",
"multipart",
@@ -1615,6 +1609,12 @@ version = "0.2.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de"
[[package]]
name = "precomputed-hash"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c"
[[package]]
name = "proc-macro-crate"
version = "1.3.1"
@@ -1931,23 +1931,18 @@ dependencies = [
"yaml-rust",
]
[[package]]
name = "sha2"
version = "0.10.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "793db75ad2bcafc3ffa7c68b215fee268f537982cd901d132f89c6343f3a3dc8"
dependencies = [
"cfg-if",
"cpufeatures",
"digest",
]
[[package]]
name = "shlex"
version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a7cee0529a6d40f580e7a5e6c495c8fbfe21b7b52795ed4bb5e62cdf92bc6380"
[[package]]
name = "siphasher"
version = "0.3.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "38b58827f4464d87d377d175e90bf58eb00fd8716ff0a62f80356b5e61555d0d"
[[package]]
name = "slab"
version = "0.4.9"
@@ -1989,6 +1984,32 @@ version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f"
[[package]]
name = "string_cache"
version = "0.8.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f91138e76242f575eb1d3b38b4f1362f10d3a43f47d182a5b359af488a02293b"
dependencies = [
"new_debug_unreachable",
"once_cell",
"parking_lot",
"phf_shared",
"precomputed-hash",
"serde",
]
[[package]]
name = "string_cache_codegen"
version = "0.5.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6bb30289b722be4ff74a408c3cc27edeaad656e06cb1fe8fa9231fa59c728988"
dependencies = [
"phf_generator",
"phf_shared",
"proc-macro2",
"quote",
]
[[package]]
name = "syn"
version = "1.0.109"
@@ -2023,6 +2044,17 @@ dependencies = [
"windows-sys 0.48.0",
]
[[package]]
name = "tendril"
version = "0.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d24a120c5fc464a3458240ee02c299ebcb9d67b5249c8848b09d639dca8d7bb0"
dependencies = [
"futf",
"mac",
"utf-8",
]
[[package]]
name = "thiserror"
version = "1.0.49"
@@ -2200,18 +2232,6 @@ dependencies = [
"unsafe-any-ors",
]
[[package]]
name = "typenum"
version = "1.17.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "42ff0bf0c66b8238c6f3b578df37d0b7848e55df8577b3f74f92a69acceeb825"
[[package]]
name = "ucd-trie"
version = "0.1.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ed646292ffc8188ef8ea4d1e0e0150fb15a5c2e12ad9b8fc191ae7a8a7f3c4b9"
[[package]]
name = "unchecked-index"
version = "0.2.2"
@@ -2286,6 +2306,12 @@ version = "0.7.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "110352d4e9076c67839003c7788d8604e24dcded13e0b375af3efaa8cf468517"
[[package]]
name = "utf-8"
version = "0.7.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9"
[[package]]
name = "utf16string"
version = "0.2.0"
@@ -2605,6 +2631,17 @@ dependencies = [
"windows-sys 0.48.0",
]
[[package]]
name = "xml5ever"
version = "0.17.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4034e1d05af98b51ad7214527730626f019682d797ba38b51689212118d8e650"
dependencies = [
"log",
"mac",
"markup5ever",
]
[[package]]
name = "yaml-rust"
version = "0.4.5"

View File

@@ -21,7 +21,7 @@ futures-util = "0.3"
getopts = "0.2"
gettext = "0.4"
hex = { version = "0.4", optional = true }
html_parser = "0.6.3"
html5ever = "*"
http = "0.2"
http-content-range = "0.1"
hyper = { version="0.14", features = ["server"], optional = true }
@@ -33,6 +33,7 @@ json = "0.12"
lazy_static = "1.4"
log = "*"
log4rs = "1"
markup5ever_rcdom = "0.2"
md5 = "0.7"
modular-bitfield = "0.11"
multipart = { features = ["server"], git = 'https://github.com/lifegpc/multipart', optional = true, default-features = false }

View File

@@ -2,7 +2,7 @@ use crate::ext::rw_lock::GetRwLock;
use lazy_static::lazy_static;
use log::LevelFilter;
use log4rs::append::console::ConsoleAppender;
use log4rs::config::{Appender, Root};
use log4rs::config::{Appender, Logger, Root};
use log4rs::encode::pattern::PatternEncoder;
use log4rs::{init_config, Config, Handle};
use std::sync::RwLock;
@@ -17,6 +17,10 @@ pub fn init_with_level(level: LevelFilter) {
.build();
let config = Config::builder()
.appender(Appender::builder().build("stdout", Box::new(stdout)))
.logger(Logger::builder().build("html5ever::tree_builder", LevelFilter::Warn))
.logger(Logger::builder().build("html5ever::tokenizer", LevelFilter::Warn))
.logger(Logger::builder().build("html5ever::tokenizer::char_ref", LevelFilter::Warn))
.logger(Logger::builder().build("reqwest::connect", LevelFilter::Warn))
.build(Root::builder().appender("stdout").build(level))
.unwrap();
let mut h = HANDLE.get_mut();

View File

@@ -1,8 +1,9 @@
use crate::error::PixivDownloaderError;
use crate::gettext;
use crate::pixiv_link::remove_track;
use html_parser::Dom;
use html_parser::Node;
use html5ever::tendril::TendrilSink;
use html5ever::{parse_document, ParseOpts};
use markup5ever_rcdom::{Node, NodeData, RcDom};
use std::collections::HashMap;
use std::default::Default;
@@ -125,17 +126,17 @@ impl DescriptionParser {
}
pub fn iter(&mut self, node: &Node) {
match node {
Node::Comment(_) => {}
Node::Text(s) => {
match &node.data {
NodeData::Text { contents } => {
let s = contents.borrow().to_string();
if self.nodes.len() == 0 {
self.data += s;
self.data += &s;
} else {
self.nodes.last_mut().unwrap().data += s;
self.nodes.last_mut().unwrap().data += &s;
}
}
Node::Element(e) => {
let tag = e.name.as_str();
NodeData::Element { name, attrs, .. } => {
let tag = name.local.to_string();
if tag == "script" || tag == "style" {
return;
} else if tag == "br" {
@@ -147,20 +148,19 @@ impl DescriptionParser {
}
return;
}
let mut node = DescriptionNode::default();
node.tag = tag.to_string();
let mut nod = DescriptionNode::default();
nod.tag = tag.to_string();
let attrs = attrs.borrow();
if tag == "a" {
let href = e.attributes.get("href");
let href = attrs.iter().find(|k| k.name.local.to_string() == "href");
if href.is_some() {
let href = href.unwrap();
if href.is_some() {
let link = remove_track(href.as_ref().unwrap());
node.add_attr("href", link.as_str());
}
let href = href.unwrap().value.to_string();
let link = remove_track(href);
nod.add_attr("href", link.as_str());
}
}
self.nodes.push(node);
for n in e.children.iter() {
self.nodes.push(nod);
for n in node.children.borrow().iter() {
self.iter(n);
}
let node = self.nodes.pop().unwrap();
@@ -192,24 +192,24 @@ impl DescriptionParser {
n.data += s.as_str();
}
}
_ => {}
}
}
pub fn parse<S: AsRef<str> + ?Sized>(&mut self, desc: &S) -> Result<(), PixivDownloaderError> {
let r = Dom::parse(desc.as_ref());
if r.is_err() {
return Err(format!("{} {}", gettext("Failed to parse HTML:"), r.unwrap_err()).into());
}
let dom = r.unwrap();
if dom.errors.len() > 0 {
let mut s = String::from(gettext("Some errors occured during parsing:"));
for i in dom.errors.iter() {
s += "\n";
s += i;
let opts = ParseOpts::default();
let r = parse_document(RcDom::default(), opts)
.from_utf8()
.read_from(&mut desc.as_ref().as_bytes());
let dom = match r {
Ok(d) => d,
Err(e) => {
return Err(
format!("{} {}", gettext("Failed to parse HTML:"), e.to_string()).into(),
)
}
return Err(s.into());
}
for node in dom.children.iter() {
};
for node in dom.document.children.borrow().iter() {
self.iter(node)
}
if self.nodes.len() != 0 {
@@ -254,7 +254,7 @@ fn test_parse_description() {
parse_description("a <a href=\"https://a.com\">https://a.com</a>")
);
assert_eq!(
Some(String::from("a [a\n[bc](a.com)d](b.com)\ndata")),
Some(String::from("a [a\n](b.com)[bc](a.com)d\ndata")),
parse_description("a <a href=\"b.com\">a<br/><a href=\"a.com\">bc</a>d</a><br>data")
);
assert_eq!(

View File

@@ -1,7 +1,8 @@
use crate::gettext;
use html_parser::Dom;
use html_parser::Node;
use html5ever::tendril::TendrilSink;
use html5ever::{parse_document, ParseOpts};
use json::JsonValue;
use markup5ever_rcdom::{Node, NodeData, RcDom};
use std::default::Default;
pub struct MetaDataParser {
@@ -20,38 +21,34 @@ impl MetaDataParser {
}
fn iter(&mut self, node: &Node) -> bool {
match node {
Node::Element(e) => {
if e.name == "meta" {
let name = e.attributes.get("name");
match &node.data {
NodeData::Element { name, attrs, .. } => {
if name.local.as_ref() == "meta" {
let attrs = attrs.borrow();
let name = attrs.iter().find(|a| a.name.local.as_ref() == "name");
if name.is_none() {
return false;
}
let name = name.unwrap();
if name.is_none() {
if name.value.as_ref() != self.key.as_str() {
return false;
}
if name.as_ref().unwrap() != self.key.as_str() {
return false;
}
if e.id.is_none() {
let id = attrs.iter().find(|a| a.name.local.as_ref() == "id");
if id.is_none() {
return false;
}
let id = id.unwrap();
let mkey = format!("meta-{}", self.key.as_str());
if e.id.as_ref().unwrap() != mkey.as_str()
&& e.id.as_ref().unwrap() != self.key.as_str()
{
let id = id.value.as_ref();
if id != mkey.as_str() && id != self.key.as_str() {
return false;
}
let c = e.attributes.get("content");
let c = attrs.iter().find(|a| a.name.local.as_ref() == "content");
if c.is_none() {
return false;
}
let c = c.unwrap();
if c.is_none() {
return false;
}
let r = json::parse(c.as_ref().unwrap());
let r = json::parse(c.value.as_ref());
if r.is_err() {
log::error!("{} {}", gettext("Failed to parse JSON:"), r.unwrap_err());
return false;
@@ -59,7 +56,7 @@ impl MetaDataParser {
self.value = Some(r.unwrap());
true
} else {
for c in e.children.iter() {
for c in node.children.borrow().iter() {
if self.iter(c) {
return true;
}
@@ -67,25 +64,23 @@ impl MetaDataParser {
false
}
}
Node::Comment(_) => false,
Node::Text(_) => false,
_ => false,
}
}
pub fn parse(&mut self, context: &str) -> bool {
let r = Dom::parse(context);
if r.is_err() {
log::error!("{} {}", gettext("Failed to parse HTML:"), r.unwrap_err());
return false;
}
let dom = r.unwrap();
if dom.errors.len() > 0 {
log::error!("{}", gettext("Some errors occured during parsing:"));
for i in dom.errors.iter() {
log::error!("{}", i);
let opts = ParseOpts::default();
let r = parse_document(RcDom::default(), opts)
.from_utf8()
.read_from(&mut context.as_bytes());
let dom = match r {
Ok(d) => d,
Err(e) => {
log::error!("{} {}", gettext("Failed to parse HTML:"), e.to_string());
return false;
}
}
for n in dom.children.iter() {
};
for n in dom.document.children.borrow().iter() {
if self.iter(n) {
return true;
}