mirror of
https://github.com/lifegpc/pixiv_downloader.git
synced 2026-06-06 05:49:01 +08:00
add ensure_link_ascii to Description Parser
This commit is contained in:
@@ -39,7 +39,7 @@ modular-bitfield = "0.11"
|
||||
multipart = { features = ["server"], git = 'https://github.com/lifegpc/multipart', optional = true, default-features = false }
|
||||
openssl = { version = "0.10", optional = true }
|
||||
parse-size = "1"
|
||||
percent-encoding = { version = "*", optional = true }
|
||||
percent-encoding = "*"
|
||||
proc_macros = { path = "proc_macros" }
|
||||
rand = { version = "0", optional = true }
|
||||
regex = "1"
|
||||
@@ -66,7 +66,7 @@ db_all = ["db", "db_sqlite"]
|
||||
db_sqlite = ["rusqlite"]
|
||||
docker = []
|
||||
exif = ["bindgen", "c_fixed_string", "cmake", "link-cplusplus", "utf16string"]
|
||||
server = ["async-trait", "base64", "db", "hex", "hyper", "multipart", "openssl", "serde_json", "rand", "serde_urlencoded", "percent-encoding"]
|
||||
server = ["async-trait", "base64", "db", "hex", "hyper", "multipart", "openssl", "serde_json", "rand", "serde_urlencoded"]
|
||||
ugoira = ["avdict", "bindgen", "cmake", "link-cplusplus"]
|
||||
|
||||
[patch.crates-io]
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
use crate::ext::rw_lock::GetRwLock;
|
||||
use lazy_static::lazy_static;
|
||||
use log::LevelFilter;
|
||||
use log4rs::append::console::ConsoleAppender;
|
||||
use log4rs::config::{Appender, Logger, Root};
|
||||
@@ -8,6 +7,7 @@ use log4rs::{init_config, Config, Handle};
|
||||
use std::sync::RwLock;
|
||||
|
||||
lazy_static! {
|
||||
#[doc(hidden)]
|
||||
static ref HANDLE: RwLock<Option<Handle>> = RwLock::new(None);
|
||||
}
|
||||
|
||||
|
||||
@@ -4,9 +4,33 @@ use crate::pixiv_link::remove_track;
|
||||
use html5ever::tendril::TendrilSink;
|
||||
use html5ever::{parse_document, ParseOpts};
|
||||
use markup5ever_rcdom::{Node, NodeData, RcDom};
|
||||
use percent_encoding::{percent_encode, AsciiSet, NON_ALPHANUMERIC};
|
||||
use std::collections::HashMap;
|
||||
use std::default::Default;
|
||||
|
||||
const URLENCODE: &AsciiSet = &NON_ALPHANUMERIC
|
||||
.remove(b':')
|
||||
.remove(b'/')
|
||||
.remove(b'?')
|
||||
.remove(b'#')
|
||||
.remove(b'[')
|
||||
.remove(b']')
|
||||
.remove(b'@')
|
||||
.remove(b'!')
|
||||
.remove(b'$')
|
||||
.remove(b'&')
|
||||
.remove(b'\'')
|
||||
.remove(b'(')
|
||||
.remove(b')')
|
||||
.remove(b'*')
|
||||
.remove(b'+')
|
||||
.remove(b',')
|
||||
.remove(b';')
|
||||
.remove(b'=')
|
||||
.remove(b'%')
|
||||
.remove(b' ')
|
||||
.remove(b'.');
|
||||
|
||||
/// Reprent a node
|
||||
#[derive(Debug)]
|
||||
struct DescriptionNode {
|
||||
@@ -87,12 +111,14 @@ impl DescriptionNode {
|
||||
s
|
||||
}
|
||||
|
||||
pub fn to_link(&self) -> String {
|
||||
format!(
|
||||
"[{}]({})",
|
||||
self.data.as_str(),
|
||||
self.attrs.get("href").unwrap()
|
||||
)
|
||||
pub fn to_link(&self, ascii: bool) -> String {
|
||||
let href = self.attrs.get("href").unwrap();
|
||||
let href = if ascii {
|
||||
percent_encode(href.as_bytes(), URLENCODE).to_string()
|
||||
} else {
|
||||
href.clone()
|
||||
};
|
||||
format!("[{}]({})", self.data.as_str(), href)
|
||||
}
|
||||
|
||||
pub fn to_paragraph(&self) -> String {
|
||||
@@ -106,14 +132,41 @@ impl DescriptionNode {
|
||||
}
|
||||
}
|
||||
|
||||
pub struct DescriptionParserBuilder {
|
||||
/// Markdown mode
|
||||
md_mode: bool,
|
||||
/// Ensure link is ASCII
|
||||
_ensure_link_ascii: bool,
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
impl DescriptionParserBuilder {
|
||||
pub fn new(md_mode: bool) -> Self {
|
||||
Self {
|
||||
md_mode,
|
||||
_ensure_link_ascii: false,
|
||||
}
|
||||
}
|
||||
|
||||
/// Ensure link is ASCII
|
||||
pub fn ensure_link_ascii(mut self) -> Self {
|
||||
self._ensure_link_ascii = true;
|
||||
self
|
||||
}
|
||||
|
||||
pub fn build(self) -> DescriptionParser {
|
||||
DescriptionParser::from(self)
|
||||
}
|
||||
}
|
||||
|
||||
/// A simple HTML parser to parse description HTML
|
||||
pub struct DescriptionParser {
|
||||
/// Current nodes stack
|
||||
nodes: Vec<DescriptionNode>,
|
||||
/// Output
|
||||
pub data: String,
|
||||
/// Markdown mode
|
||||
md_mode: bool,
|
||||
/// Options
|
||||
opts: DescriptionParserBuilder,
|
||||
}
|
||||
|
||||
impl DescriptionParser {
|
||||
@@ -121,7 +174,7 @@ impl DescriptionParser {
|
||||
Self {
|
||||
nodes: Vec::new(),
|
||||
data: String::from(""),
|
||||
md_mode,
|
||||
opts: DescriptionParserBuilder::new(md_mode),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -140,7 +193,7 @@ impl DescriptionParser {
|
||||
if tag == "script" || tag == "style" {
|
||||
return;
|
||||
} else if tag == "br" {
|
||||
let br = if self.md_mode { " \n" } else { "\n" };
|
||||
let br = if self.opts.md_mode { " \n" } else { "\n" };
|
||||
if self.nodes.len() == 0 {
|
||||
self.data += br;
|
||||
} else {
|
||||
@@ -165,28 +218,28 @@ impl DescriptionParser {
|
||||
}
|
||||
let node = self.nodes.pop().unwrap();
|
||||
let mut is_paragraph = false;
|
||||
let s = if node.is_link(self.md_mode) {
|
||||
node.to_link()
|
||||
} else if self.md_mode && node.is_headline() {
|
||||
let s = if node.is_link(self.opts.md_mode) {
|
||||
node.to_link(self.opts._ensure_link_ascii)
|
||||
} else if self.opts.md_mode && node.is_headline() {
|
||||
node.to_headline()
|
||||
} else if self.md_mode && node.is_paragraph() {
|
||||
} else if self.opts.md_mode && node.is_paragraph() {
|
||||
is_paragraph = true;
|
||||
node.to_paragraph()
|
||||
} else if self.md_mode && node.is_strong() {
|
||||
} else if self.opts.md_mode && node.is_strong() {
|
||||
node.to_strong()
|
||||
} else if self.md_mode && node.is_em() {
|
||||
} else if self.opts.md_mode && node.is_em() {
|
||||
node.to_em()
|
||||
} else {
|
||||
node.data
|
||||
};
|
||||
if self.nodes.len() == 0 {
|
||||
while self.md_mode && is_paragraph && !self.data.ends_with("\n\n") {
|
||||
while self.opts.md_mode && is_paragraph && !self.data.ends_with("\n\n") {
|
||||
self.data += "\n";
|
||||
}
|
||||
self.data += s.as_str();
|
||||
} else {
|
||||
let n = self.nodes.last_mut().unwrap();
|
||||
while self.md_mode && is_paragraph && !n.data.ends_with("\n\n") {
|
||||
while self.opts.md_mode && is_paragraph && !n.data.ends_with("\n\n") {
|
||||
n.data += "\n";
|
||||
}
|
||||
n.data += s.as_str();
|
||||
@@ -222,6 +275,21 @@ impl DescriptionParser {
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
pub fn builder(md_mode: bool) -> DescriptionParserBuilder {
|
||||
DescriptionParserBuilder::new(md_mode)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<DescriptionParserBuilder> for DescriptionParser {
|
||||
fn from(opts: DescriptionParserBuilder) -> Self {
|
||||
Self {
|
||||
nodes: Vec::new(),
|
||||
data: String::from(""),
|
||||
opts,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn parse_description<S: AsRef<str> + ?Sized>(desc: &S) -> Option<String> {
|
||||
@@ -285,3 +353,14 @@ fn test_convert_description_to_md() {
|
||||
convert_description_to_md("<h1>Head</h1>D<p>He</p><p>Be</p>t<em><strong>e</strong>s</em>t<p><a href=\"/jump.php?https%3A%2F%2Fa.com\">Link</a></p>").unwrap()
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_ensure_link_ascii() {
|
||||
let mut p = DescriptionParser::builder(true).ensure_link_ascii().build();
|
||||
p.parse("<a href=\"https://test:[email protected]/ad/测试?p=1&t=*\">测试<a>")
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
String::from("[测试](https://test:[email protected]/ad/%E6%B5%8B%E8%AF%95?p=1&t=*)"),
|
||||
p.data
|
||||
);
|
||||
}
|
||||
|
||||
@@ -393,7 +393,7 @@ impl<'a> RunContext<'a> {
|
||||
}
|
||||
}
|
||||
if let Some(desc) = self.desc() {
|
||||
let mut p = DescriptionParser::new(true);
|
||||
let mut p = DescriptionParser::builder(true).ensure_link_ascii().build();
|
||||
p.parse(desc)?;
|
||||
while !text.ends_with("\n\n") {
|
||||
text.push_str("\n");
|
||||
@@ -473,7 +473,7 @@ impl<'a> RunContext<'a> {
|
||||
}
|
||||
}
|
||||
if let Some(desc) = self.desc() {
|
||||
let mut p = DescriptionParser::new(true);
|
||||
let mut p = DescriptionParser::builder(true).ensure_link_ascii().build();
|
||||
p.parse(desc)?;
|
||||
while !text.ends_with("\n\n") {
|
||||
text.push_str("\n");
|
||||
|
||||
Reference in New Issue
Block a user