diff --git a/Cargo.lock b/Cargo.lock index 7fa9374..f2cd614 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -192,15 +192,6 @@ version = "2.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b4682ae6287fcf752ecaabbfcc7b6f9b72aa33933dc23a554d853aea8eea8635" -[[package]] -name = "block-buffer" -version = "0.10.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" -dependencies = [ - "generic-array", -] - [[package]] name = "brotli" version = "3.4.0" @@ -361,15 +352,6 @@ version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e496a50fda8aacccc86d7529e2c1e0892dbd0f898a6b5645b5561b89c3210efa" -[[package]] -name = "cpufeatures" -version = "0.2.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a17b76ff3a4162b0b27f354a0c87015ddad39d35f9c0c36607a3bdd175dde1f1" -dependencies = [ - "libc", -] - [[package]] name = "crc32fast" version = "1.3.2" @@ -379,16 +361,6 @@ dependencies = [ "cfg-if", ] -[[package]] -name = "crypto-common" -version = "0.1.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3" -dependencies = [ - "generic-array", - "typenum", -] - [[package]] name = "dateparser" version = "0.2.0" @@ -431,16 +403,6 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3c877555693c14d2f84191cfd3ad8582790fc52b5e2274b40b59cf5f5cea25c7" -[[package]] -name = "digest" -version = "0.10.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" -dependencies = [ - "block-buffer", - "crypto-common", -] - [[package]] name = "either" version = "1.9.0" @@ -627,6 +589,16 @@ dependencies = [ "percent-encoding", ] +[[package]] +name = "futf" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df420e2e84819663797d1ec6544b13c5be84629e7bb00dc960d6917db2987843" +dependencies = [ + "mac", + "new_debug_unreachable", +] + [[package]] name = "futures-channel" version = "0.3.28" @@ -688,16 +660,6 @@ dependencies = [ "slab", ] -[[package]] -name = "generic-array" -version = "0.14.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" -dependencies = [ - "typenum", - "version_check", -] - [[package]] name = "getopts" version = "0.2.21" @@ -806,17 +768,17 @@ dependencies = [ ] [[package]] -name = "html_parser" -version = "0.6.3" +name = "html5ever" +version = "0.26.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec016cabcf7c9c48f9d5fdc6b03f273585bfce640a0f47a69552039e92b1959a" +checksum = "bea68cab48b8459f17cf1c944c67ddc572d272d9f2b274140f223ecb1da4a3b7" dependencies = [ - "pest", - "pest_derive", - "serde", - "serde_derive", - "serde_json", - "thiserror", + "log", + "mac", + "markup5ever", + "proc-macro2", + "quote", + "syn 1.0.109", ] [[package]] @@ -1156,6 +1118,38 @@ dependencies = [ "winapi", ] +[[package]] +name = "mac" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4" + +[[package]] +name = "markup5ever" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a2629bb1404f3d34c2e921f21fd34ba00b206124c81f65c50b43b6aaefeb016" +dependencies = [ + "log", + "phf", + "phf_codegen", + "string_cache", + "string_cache_codegen", + "tendril", +] + +[[package]] +name = "markup5ever_rcdom" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9521dd6750f8e80ee6c53d65e2e4656d7de37064f3a7a5d2d11d05df93839c2" +dependencies = [ + "html5ever", + "markup5ever", + "tendril", + "xml5ever", +] + [[package]] name = "md5" version = "0.7.0" @@ -1264,6 +1258,12 @@ dependencies = [ "tempfile", ] +[[package]] +name = "new_debug_unreachable" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e4a24736216ec316047a1fc4252e27dabb04218aa4a3f37c6e7ddbf1f9782b54" + [[package]] name = "nom" version = "7.1.3" @@ -1486,48 +1486,41 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9b2a4787296e9989611394c33f193f676704af1686e70b8f8033ab5ba9a35a94" [[package]] -name = "pest" -version = "2.7.4" +name = "phf" +version = "0.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c022f1e7b65d6a24c0dbbd5fb344c66881bc01f3e5ae74a1c8100f2f985d98a4" +checksum = "fabbf1ead8a5bcbc20f5f8b939ee3f5b0f6f281b6ad3468b84656b658b455259" dependencies = [ - "memchr", - "thiserror", - "ucd-trie", + "phf_shared", ] [[package]] -name = "pest_derive" -version = "2.7.4" +name = "phf_codegen" +version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "35513f630d46400a977c4cb58f78e1bfbe01434316e60c37d27b9ad6139c66d8" +checksum = "4fb1c3a8bc4dd4e5cfce29b44ffc14bedd2ee294559a294e2a4d4c9e9a6a13cd" dependencies = [ - "pest", - "pest_generator", + "phf_generator", + "phf_shared", ] [[package]] -name = "pest_generator" -version = "2.7.4" +name = "phf_generator" +version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc9fc1b9e7057baba189b5c626e2d6f40681ae5b6eb064dc7c7834101ec8123a" +checksum = "5d5285893bb5eb82e6aaf5d59ee909a06a16737a8970984dd7746ba9283498d6" dependencies = [ - "pest", - "pest_meta", - "proc-macro2", - "quote", - "syn 2.0.37", + "phf_shared", + "rand", ] [[package]] -name = "pest_meta" -version = "2.7.4" +name = "phf_shared" +version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1df74e9e7ec4053ceb980e7c0c8bd3594e977fde1af91daba9c928e8e8c6708d" +checksum = "b6796ad771acdc0123d2a88dc428b5e38ef24456743ddb1744ed628f9815c096" dependencies = [ - "once_cell", - "pest", - "sha2", + "siphasher", ] [[package]] @@ -1564,7 +1557,7 @@ dependencies = [ "getopts", "gettext", "hex", - "html_parser", + "html5ever", "http", "http-content-range", "hyper", @@ -1577,6 +1570,7 @@ dependencies = [ "link-cplusplus", "log", "log4rs", + "markup5ever_rcdom", "md5", "modular-bitfield", "multipart", @@ -1615,6 +1609,12 @@ version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" +[[package]] +name = "precomputed-hash" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c" + [[package]] name = "proc-macro-crate" version = "1.3.1" @@ -1931,23 +1931,18 @@ dependencies = [ "yaml-rust", ] -[[package]] -name = "sha2" -version = "0.10.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "793db75ad2bcafc3ffa7c68b215fee268f537982cd901d132f89c6343f3a3dc8" -dependencies = [ - "cfg-if", - "cpufeatures", - "digest", -] - [[package]] name = "shlex" version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a7cee0529a6d40f580e7a5e6c495c8fbfe21b7b52795ed4bb5e62cdf92bc6380" +[[package]] +name = "siphasher" +version = "0.3.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38b58827f4464d87d377d175e90bf58eb00fd8716ff0a62f80356b5e61555d0d" + [[package]] name = "slab" version = "0.4.9" @@ -1989,6 +1984,32 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" +[[package]] +name = "string_cache" +version = "0.8.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f91138e76242f575eb1d3b38b4f1362f10d3a43f47d182a5b359af488a02293b" +dependencies = [ + "new_debug_unreachable", + "once_cell", + "parking_lot", + "phf_shared", + "precomputed-hash", + "serde", +] + +[[package]] +name = "string_cache_codegen" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6bb30289b722be4ff74a408c3cc27edeaad656e06cb1fe8fa9231fa59c728988" +dependencies = [ + "phf_generator", + "phf_shared", + "proc-macro2", + "quote", +] + [[package]] name = "syn" version = "1.0.109" @@ -2023,6 +2044,17 @@ dependencies = [ "windows-sys 0.48.0", ] +[[package]] +name = "tendril" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d24a120c5fc464a3458240ee02c299ebcb9d67b5249c8848b09d639dca8d7bb0" +dependencies = [ + "futf", + "mac", + "utf-8", +] + [[package]] name = "thiserror" version = "1.0.49" @@ -2200,18 +2232,6 @@ dependencies = [ "unsafe-any-ors", ] -[[package]] -name = "typenum" -version = "1.17.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "42ff0bf0c66b8238c6f3b578df37d0b7848e55df8577b3f74f92a69acceeb825" - -[[package]] -name = "ucd-trie" -version = "0.1.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ed646292ffc8188ef8ea4d1e0e0150fb15a5c2e12ad9b8fc191ae7a8a7f3c4b9" - [[package]] name = "unchecked-index" version = "0.2.2" @@ -2286,6 +2306,12 @@ version = "0.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "110352d4e9076c67839003c7788d8604e24dcded13e0b375af3efaa8cf468517" +[[package]] +name = "utf-8" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9" + [[package]] name = "utf16string" version = "0.2.0" @@ -2605,6 +2631,17 @@ dependencies = [ "windows-sys 0.48.0", ] +[[package]] +name = "xml5ever" +version = "0.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4034e1d05af98b51ad7214527730626f019682d797ba38b51689212118d8e650" +dependencies = [ + "log", + "mac", + "markup5ever", +] + [[package]] name = "yaml-rust" version = "0.4.5" diff --git a/Cargo.toml b/Cargo.toml index d055b25..4a75e50 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -21,7 +21,7 @@ futures-util = "0.3" getopts = "0.2" gettext = "0.4" hex = { version = "0.4", optional = true } -html_parser = "0.6.3" +html5ever = "*" http = "0.2" http-content-range = "0.1" hyper = { version="0.14", features = ["server"], optional = true } @@ -33,6 +33,7 @@ json = "0.12" lazy_static = "1.4" log = "*" log4rs = "1" +markup5ever_rcdom = "0.2" md5 = "0.7" modular-bitfield = "0.11" multipart = { features = ["server"], git = 'https://github.com/lifegpc/multipart', optional = true, default-features = false } diff --git a/src/log_cfg.rs b/src/log_cfg.rs index a9bca3b..29edec4 100644 --- a/src/log_cfg.rs +++ b/src/log_cfg.rs @@ -2,7 +2,7 @@ use crate::ext::rw_lock::GetRwLock; use lazy_static::lazy_static; use log::LevelFilter; use log4rs::append::console::ConsoleAppender; -use log4rs::config::{Appender, Root}; +use log4rs::config::{Appender, Logger, Root}; use log4rs::encode::pattern::PatternEncoder; use log4rs::{init_config, Config, Handle}; use std::sync::RwLock; @@ -17,6 +17,10 @@ pub fn init_with_level(level: LevelFilter) { .build(); let config = Config::builder() .appender(Appender::builder().build("stdout", Box::new(stdout))) + .logger(Logger::builder().build("html5ever::tree_builder", LevelFilter::Warn)) + .logger(Logger::builder().build("html5ever::tokenizer", LevelFilter::Warn)) + .logger(Logger::builder().build("html5ever::tokenizer::char_ref", LevelFilter::Warn)) + .logger(Logger::builder().build("reqwest::connect", LevelFilter::Warn)) .build(Root::builder().appender("stdout").build(level)) .unwrap(); let mut h = HANDLE.get_mut(); diff --git a/src/parser/description.rs b/src/parser/description.rs index c74eb86..8c2dd41 100644 --- a/src/parser/description.rs +++ b/src/parser/description.rs @@ -1,8 +1,9 @@ use crate::error::PixivDownloaderError; use crate::gettext; use crate::pixiv_link::remove_track; -use html_parser::Dom; -use html_parser::Node; +use html5ever::tendril::TendrilSink; +use html5ever::{parse_document, ParseOpts}; +use markup5ever_rcdom::{Node, NodeData, RcDom}; use std::collections::HashMap; use std::default::Default; @@ -125,17 +126,17 @@ impl DescriptionParser { } pub fn iter(&mut self, node: &Node) { - match node { - Node::Comment(_) => {} - Node::Text(s) => { + match &node.data { + NodeData::Text { contents } => { + let s = contents.borrow().to_string(); if self.nodes.len() == 0 { - self.data += s; + self.data += &s; } else { - self.nodes.last_mut().unwrap().data += s; + self.nodes.last_mut().unwrap().data += &s; } } - Node::Element(e) => { - let tag = e.name.as_str(); + NodeData::Element { name, attrs, .. } => { + let tag = name.local.to_string(); if tag == "script" || tag == "style" { return; } else if tag == "br" { @@ -147,20 +148,19 @@ impl DescriptionParser { } return; } - let mut node = DescriptionNode::default(); - node.tag = tag.to_string(); + let mut nod = DescriptionNode::default(); + nod.tag = tag.to_string(); + let attrs = attrs.borrow(); if tag == "a" { - let href = e.attributes.get("href"); + let href = attrs.iter().find(|k| k.name.local.to_string() == "href"); if href.is_some() { - let href = href.unwrap(); - if href.is_some() { - let link = remove_track(href.as_ref().unwrap()); - node.add_attr("href", link.as_str()); - } + let href = href.unwrap().value.to_string(); + let link = remove_track(href); + nod.add_attr("href", link.as_str()); } } - self.nodes.push(node); - for n in e.children.iter() { + self.nodes.push(nod); + for n in node.children.borrow().iter() { self.iter(n); } let node = self.nodes.pop().unwrap(); @@ -192,24 +192,24 @@ impl DescriptionParser { n.data += s.as_str(); } } + _ => {} } } pub fn parse + ?Sized>(&mut self, desc: &S) -> Result<(), PixivDownloaderError> { - let r = Dom::parse(desc.as_ref()); - if r.is_err() { - return Err(format!("{} {}", gettext("Failed to parse HTML:"), r.unwrap_err()).into()); - } - let dom = r.unwrap(); - if dom.errors.len() > 0 { - let mut s = String::from(gettext("Some errors occured during parsing:")); - for i in dom.errors.iter() { - s += "\n"; - s += i; + let opts = ParseOpts::default(); + let r = parse_document(RcDom::default(), opts) + .from_utf8() + .read_from(&mut desc.as_ref().as_bytes()); + let dom = match r { + Ok(d) => d, + Err(e) => { + return Err( + format!("{} {}", gettext("Failed to parse HTML:"), e.to_string()).into(), + ) } - return Err(s.into()); - } - for node in dom.children.iter() { + }; + for node in dom.document.children.borrow().iter() { self.iter(node) } if self.nodes.len() != 0 { @@ -254,7 +254,7 @@ fn test_parse_description() { parse_description("a https://a.com") ); assert_eq!( - Some(String::from("a [a\n[bc](a.com)d](b.com)\ndata")), + Some(String::from("a [a\n](b.com)[bc](a.com)d\ndata")), parse_description("a a
bcd
data") ); assert_eq!( diff --git a/src/parser/metadata.rs b/src/parser/metadata.rs index 4e8464e..2420480 100644 --- a/src/parser/metadata.rs +++ b/src/parser/metadata.rs @@ -1,7 +1,8 @@ use crate::gettext; -use html_parser::Dom; -use html_parser::Node; +use html5ever::tendril::TendrilSink; +use html5ever::{parse_document, ParseOpts}; use json::JsonValue; +use markup5ever_rcdom::{Node, NodeData, RcDom}; use std::default::Default; pub struct MetaDataParser { @@ -20,38 +21,34 @@ impl MetaDataParser { } fn iter(&mut self, node: &Node) -> bool { - match node { - Node::Element(e) => { - if e.name == "meta" { - let name = e.attributes.get("name"); + match &node.data { + NodeData::Element { name, attrs, .. } => { + if name.local.as_ref() == "meta" { + let attrs = attrs.borrow(); + let name = attrs.iter().find(|a| a.name.local.as_ref() == "name"); if name.is_none() { return false; } let name = name.unwrap(); - if name.is_none() { + if name.value.as_ref() != self.key.as_str() { return false; } - if name.as_ref().unwrap() != self.key.as_str() { - return false; - } - if e.id.is_none() { + let id = attrs.iter().find(|a| a.name.local.as_ref() == "id"); + if id.is_none() { return false; } + let id = id.unwrap(); let mkey = format!("meta-{}", self.key.as_str()); - if e.id.as_ref().unwrap() != mkey.as_str() - && e.id.as_ref().unwrap() != self.key.as_str() - { + let id = id.value.as_ref(); + if id != mkey.as_str() && id != self.key.as_str() { return false; } - let c = e.attributes.get("content"); + let c = attrs.iter().find(|a| a.name.local.as_ref() == "content"); if c.is_none() { return false; } let c = c.unwrap(); - if c.is_none() { - return false; - } - let r = json::parse(c.as_ref().unwrap()); + let r = json::parse(c.value.as_ref()); if r.is_err() { log::error!("{} {}", gettext("Failed to parse JSON:"), r.unwrap_err()); return false; @@ -59,7 +56,7 @@ impl MetaDataParser { self.value = Some(r.unwrap()); true } else { - for c in e.children.iter() { + for c in node.children.borrow().iter() { if self.iter(c) { return true; } @@ -67,25 +64,23 @@ impl MetaDataParser { false } } - Node::Comment(_) => false, - Node::Text(_) => false, + _ => false, } } pub fn parse(&mut self, context: &str) -> bool { - let r = Dom::parse(context); - if r.is_err() { - log::error!("{} {}", gettext("Failed to parse HTML:"), r.unwrap_err()); - return false; - } - let dom = r.unwrap(); - if dom.errors.len() > 0 { - log::error!("{}", gettext("Some errors occured during parsing:")); - for i in dom.errors.iter() { - log::error!("{}", i); + let opts = ParseOpts::default(); + let r = parse_document(RcDom::default(), opts) + .from_utf8() + .read_from(&mut context.as_bytes()); + let dom = match r { + Ok(d) => d, + Err(e) => { + log::error!("{} {}", gettext("Failed to parse HTML:"), e.to_string()); + return false; } - } - for n in dom.children.iter() { + }; + for n in dom.document.children.borrow().iter() { if self.iter(n) { return true; }