add ensure_link_ascii to Description Parser

This commit is contained in:
2023-11-01 12:35:50 +00:00
committed by GitHub
parent 061a3160d6
commit 9b4ac20b8d
4 changed files with 102 additions and 23 deletions

View File

@@ -39,7 +39,7 @@ modular-bitfield = "0.11"
multipart = { features = ["server"], git = 'https://github.com/lifegpc/multipart', optional = true, default-features = false }
openssl = { version = "0.10", optional = true }
parse-size = "1"
percent-encoding = { version = "*", optional = true }
percent-encoding = "*"
proc_macros = { path = "proc_macros" }
rand = { version = "0", optional = true }
regex = "1"
@@ -66,7 +66,7 @@ db_all = ["db", "db_sqlite"]
db_sqlite = ["rusqlite"]
docker = []
exif = ["bindgen", "c_fixed_string", "cmake", "link-cplusplus", "utf16string"]
server = ["async-trait", "base64", "db", "hex", "hyper", "multipart", "openssl", "serde_json", "rand", "serde_urlencoded", "percent-encoding"]
server = ["async-trait", "base64", "db", "hex", "hyper", "multipart", "openssl", "serde_json", "rand", "serde_urlencoded"]
ugoira = ["avdict", "bindgen", "cmake", "link-cplusplus"]
[patch.crates-io]

View File

@@ -1,5 +1,4 @@
use crate::ext::rw_lock::GetRwLock;
use lazy_static::lazy_static;
use log::LevelFilter;
use log4rs::append::console::ConsoleAppender;
use log4rs::config::{Appender, Logger, Root};
@@ -8,6 +7,7 @@ use log4rs::{init_config, Config, Handle};
use std::sync::RwLock;
// Lazily-initialised global slot for the log4rs `Handle`.
// It starts out as `None` and is guarded by an `RwLock`.
lazy_static! {
#[doc(hidden)]
static ref HANDLE: RwLock<Option<Handle>> = RwLock::new(None);
}

View File

@@ -4,9 +4,33 @@ use crate::pixiv_link::remove_track;
use html5ever::tendril::TendrilSink;
use html5ever::{parse_document, ParseOpts};
use markup5ever_rcdom::{Node, NodeData, RcDom};
use percent_encoding::{percent_encode, AsciiSet, NON_ALPHANUMERIC};
use std::collections::HashMap;
use std::default::Default;
/// Set of ASCII bytes that get percent-encoded when a link must be ASCII-only.
///
/// The set starts from [`NON_ALPHANUMERIC`] (every ASCII byte that is not a
/// letter or digit) and then exempts the bytes that must stay literal for an
/// already well-formed URL to keep its meaning:
///
/// * the RFC 3986 reserved characters (gen-delims and sub-delims), so the
///   scheme, authority, path, query and fragment structure is untouched;
/// * `%`, so percent-escapes already present in the link are not encoded twice;
/// * the space character, which is already ASCII and is left literal;
/// * the RFC 3986 unreserved marks `.`, `-`, `_` and `~`, which should never be
///   percent-encoded (otherwise `my-site.com/a_b` becomes `my%2Dsite.com/a%5Fb`).
///
/// Non-ASCII bytes are always encoded by `percent_encode`, regardless of this set.
const URLENCODE: &AsciiSet = &NON_ALPHANUMERIC
    // RFC 3986 gen-delims
    .remove(b':')
    .remove(b'/')
    .remove(b'?')
    .remove(b'#')
    .remove(b'[')
    .remove(b']')
    .remove(b'@')
    // RFC 3986 sub-delims
    .remove(b'!')
    .remove(b'$')
    .remove(b'&')
    .remove(b'\'')
    .remove(b'(')
    .remove(b')')
    .remove(b'*')
    .remove(b'+')
    .remove(b',')
    .remove(b';')
    .remove(b'=')
    // Keep existing percent-escapes intact
    .remove(b'%')
    // Space is left literal
    .remove(b' ')
    // RFC 3986 unreserved marks
    .remove(b'.')
    .remove(b'-')
    .remove(b'_')
    .remove(b'~');
/// Represent a node
#[derive(Debug)]
struct DescriptionNode {
@@ -87,12 +111,14 @@ impl DescriptionNode {
s
}
pub fn to_link(&self) -> String {
format!(
"[{}]({})",
self.data.as_str(),
self.attrs.get("href").unwrap()
)
pub fn to_link(&self, ascii: bool) -> String {
let href = self.attrs.get("href").unwrap();
let href = if ascii {
percent_encode(href.as_bytes(), URLENCODE).to_string()
} else {
href.clone()
};
format!("[{}]({})", self.data.as_str(), href)
}
pub fn to_paragraph(&self) -> String {
@@ -106,14 +132,41 @@ impl DescriptionNode {
}
}
/// Options collected before a [`DescriptionParser`] is constructed.
///
/// `Clone` and `Debug` are derived so the options can be copied and logged.
#[derive(Clone, Debug)]
pub struct DescriptionParserBuilder {
    /// Markdown mode
    md_mode: bool,
    /// Ensure link is ASCII
    _ensure_link_ascii: bool,
}
#[allow(dead_code)]
impl DescriptionParserBuilder {
pub fn new(md_mode: bool) -> Self {
Self {
md_mode,
_ensure_link_ascii: false,
}
}
/// Ensure link is ASCII
pub fn ensure_link_ascii(mut self) -> Self {
self._ensure_link_ascii = true;
self
}
pub fn build(self) -> DescriptionParser {
DescriptionParser::from(self)
}
}
/// A simple HTML parser to parse description HTML
pub struct DescriptionParser {
/// Current nodes stack
nodes: Vec<DescriptionNode>,
/// Output
pub data: String,
/// Markdown mode
md_mode: bool,
/// Options
opts: DescriptionParserBuilder,
}
impl DescriptionParser {
@@ -121,7 +174,7 @@ impl DescriptionParser {
Self {
nodes: Vec::new(),
data: String::from(""),
md_mode,
opts: DescriptionParserBuilder::new(md_mode),
}
}
@@ -140,7 +193,7 @@ impl DescriptionParser {
if tag == "script" || tag == "style" {
return;
} else if tag == "br" {
let br = if self.md_mode { " \n" } else { "\n" };
let br = if self.opts.md_mode { " \n" } else { "\n" };
if self.nodes.len() == 0 {
self.data += br;
} else {
@@ -165,28 +218,28 @@ impl DescriptionParser {
}
let node = self.nodes.pop().unwrap();
let mut is_paragraph = false;
let s = if node.is_link(self.md_mode) {
node.to_link()
} else if self.md_mode && node.is_headline() {
let s = if node.is_link(self.opts.md_mode) {
node.to_link(self.opts._ensure_link_ascii)
} else if self.opts.md_mode && node.is_headline() {
node.to_headline()
} else if self.md_mode && node.is_paragraph() {
} else if self.opts.md_mode && node.is_paragraph() {
is_paragraph = true;
node.to_paragraph()
} else if self.md_mode && node.is_strong() {
} else if self.opts.md_mode && node.is_strong() {
node.to_strong()
} else if self.md_mode && node.is_em() {
} else if self.opts.md_mode && node.is_em() {
node.to_em()
} else {
node.data
};
if self.nodes.len() == 0 {
while self.md_mode && is_paragraph && !self.data.ends_with("\n\n") {
while self.opts.md_mode && is_paragraph && !self.data.ends_with("\n\n") {
self.data += "\n";
}
self.data += s.as_str();
} else {
let n = self.nodes.last_mut().unwrap();
while self.md_mode && is_paragraph && !n.data.ends_with("\n\n") {
while self.opts.md_mode && is_paragraph && !n.data.ends_with("\n\n") {
n.data += "\n";
}
n.data += s.as_str();
@@ -222,6 +275,21 @@ impl DescriptionParser {
}
Ok(())
}
/// Create a [`DescriptionParserBuilder`] for the given Markdown mode.
///
/// This simply forwards to `DescriptionParserBuilder::new`.
#[allow(dead_code)]
pub fn builder(md_mode: bool) -> DescriptionParserBuilder {
DescriptionParserBuilder::new(md_mode)
}
}
impl From<DescriptionParserBuilder> for DescriptionParser {
    /// Build a parser from `builder`, starting with an empty node stack and
    /// an empty output string.
    fn from(builder: DescriptionParserBuilder) -> Self {
        Self {
            opts: builder,
            data: String::new(),
            nodes: Vec::new(),
        }
    }
}
pub fn parse_description<S: AsRef<str> + ?Sized>(desc: &S) -> Option<String> {
@@ -285,3 +353,14 @@ fn test_convert_description_to_md() {
convert_description_to_md("<h1>Head</h1>D<p>He</p><p>Be</p>t<em><strong>e</strong>s</em>t<p><a href=\"/jump.php?https%3A%2F%2Fa.com\">Link</a></p>").unwrap()
);
}
// Parses an anchor whose href contains a non-ASCII path segment, with the
// "ensure link is ASCII" option enabled in Markdown mode. The expected output
// is a Markdown link whose path segment is percent-encoded as UTF-8 bytes,
// while the link text and the query string are left unchanged.
// NOTE(review): the `[email protected]` part of the URL looks like an
// email-obfuscation artifact from the page this was copied from — confirm the
// literal against the repository before relying on it.
#[test]
fn test_ensure_link_ascii() {
let mut p = DescriptionParser::builder(true).ensure_link_ascii().build();
p.parse("<a href=\"https://test:[email protected]/ad/测试?p=1&t=*\">测试<a>")
.unwrap();
assert_eq!(
String::from("[测试](https://test:[email protected]/ad/%E6%B5%8B%E8%AF%95?p=1&t=*)"),
p.data
);
}

View File

@@ -393,7 +393,7 @@ impl<'a> RunContext<'a> {
}
}
if let Some(desc) = self.desc() {
let mut p = DescriptionParser::new(true);
let mut p = DescriptionParser::builder(true).ensure_link_ascii().build();
p.parse(desc)?;
while !text.ends_with("\n\n") {
text.push_str("\n");
@@ -473,7 +473,7 @@ impl<'a> RunContext<'a> {
}
}
if let Some(desc) = self.desc() {
let mut p = DescriptionParser::new(true);
let mut p = DescriptionParser::builder(true).ensure_link_ascii().build();
p.parse(desc)?;
while !text.ends_with("\n\n") {
text.push_str("\n");