From dd2921f2dd3129db2b9454bd448919f476d4c603 Mon Sep 17 00:00:00 2001 From: lifegpc Date: Mon, 13 Apr 2026 12:05:32 +0800 Subject: [PATCH] Impl A thread safe DOM for xml5ever (#12) --- Cargo.lock | 48 +-- Cargo.toml | 6 +- src/ext/mod.rs | 2 +- src/ext/rcdom.rs | 99 +++-- src/scripts/entis_gls/srcxml.rs | 26 +- src/scripts/ex_hibit/arc/grp.rs | 14 +- src/utils/html5ever_arcdom.rs | 642 ++++++++++++++++++++++++++++++++ src/utils/mod.rs | 2 + 8 files changed, 744 insertions(+), 95 deletions(-) create mode 100644 src/utils/html5ever_arcdom.rs diff --git a/Cargo.lock b/Cargo.lock index 956d945..3bd5471 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -487,6 +487,19 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "crossbeam" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1137cd7e7fc0fb5d3c5a8678be38ec56e819125d8d7907411fe24ccb943faca8" +dependencies = [ + "crossbeam-channel", + "crossbeam-deque", + "crossbeam-epoch", + "crossbeam-queue", + "crossbeam-utils", +] + [[package]] name = "crossbeam-channel" version = "0.5.15" @@ -515,6 +528,15 @@ dependencies = [ "crossbeam-utils", ] +[[package]] +name = "crossbeam-queue" +version = "0.3.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0f58bbc28f91df819d0aa2a2c00cd19754769c2fad90579b3592b1c9ba7a3115" +dependencies = [ + "crossbeam-utils", +] + [[package]] name = "crossbeam-utils" version = "0.8.21" @@ -990,16 +1012,6 @@ version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fc0fef456e4baa96da950455cd02c081ca953b141298e41db3fc7e36b1da849c" -[[package]] -name = "html5ever" -version = "0.38.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1054432bae2f14e0061e33d23402fbaa67a921d319d56adc6bcf887ddad1cbc2" -dependencies = [ - "log", - "markup5ever", -] - [[package]] name = "hybrid-array" version = "0.4.10" @@ -1394,18 +1406,6 @@ dependencies = [ "web_atoms", ] -[[package]] -name = "markup5ever_rcdom" -version = "0.38.0+unofficial" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "333171ccdf66e915257740d44e38ea5b1b19ce7b45d33cc35cb6f118fbd981ff" -dependencies = [ - "html5ever", - "markup5ever", - "tendril", - "xml5ever", -] - [[package]] name = "md5" version = "0.8.0" @@ -1486,6 +1486,7 @@ dependencies = [ "cbc", "clap 4.6.0", "crc32fast", + "crossbeam", "csv", "ctrlc", "digest-io", @@ -1503,7 +1504,6 @@ dependencies = [ "libtlg-rs", "lz4", "markup5ever", - "markup5ever_rcdom", "md5", "memchr", "mozjpeg", @@ -1523,6 +1523,7 @@ dependencies = [ "sha1", "sha2", "stylua", + "tendril", "unicode-segmentation", "url", "utf16string", @@ -2146,7 +2147,6 @@ dependencies = [ "parking_lot", "phf_shared", "precomputed-hash", - "serde", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 573e696..026ef55 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -18,6 +18,7 @@ bytes = { version = "1.11", optional = true } cbc = { version = "0.2", optional = true } clap = { version = "4.5", features = ["derive"] } crc32fast = { version = "1.5", optional = true } +crossbeam = { version = "0.8", optional = true } csv = "1.3" ctrlc = "3.4" digest-io = { version = "0.1", optional = true } @@ -36,7 +37,6 @@ libflac-sys = { version = "0.3", optional = true } libtlg-rs = { version = "0.2", optional = true, features = ["encode"] } lz4 = { version = "1.28", optional = true } markup5ever = { version = "0.38", optional = true } -markup5ever_rcdom = { version = "0.38", optional = true } md5 = { version = "0.8", optional = true } memchr = { version = "2.7", optional = true } mozjpeg = { version = "0.10", optional = true } @@ -55,6 +55,7 @@ serde_yaml_ng = "0.10" sha1 = { version = "0.11", optional = true } sha2 = { version = "0.11", optional = true } stylua = { version = "2.1", optional = true, default-features = false} +tendril = { version = "0.5", optional = true } unicode-segmentation = "1.12" url = { version = "2.5", optional = true } utf16string = "0.2" @@ -87,7 +88,7 @@ circus-arc = ["circus"] circus-audio = ["circus", "flate2", "int-enum", "lossless-audio"] circus-img = ["circus", "image", "flate2", "zstd"] emote-img = ["base64", "block_compression", "emote-psb", "image", "json", "libtlg-rs", "url", "utils-psd"] -entis-gls = ["xml5ever", "markup5ever", "markup5ever_rcdom", "int-enum"] +entis-gls = ["xml5ever", "int-enum"] escude = ["int-enum"] escude-arc = ["escude", "rand", "utils-bit-stream"] ex-hibit = [] @@ -122,6 +123,7 @@ audio-flac = ["libflac-sys", "utils-pcm"] unstable = ["msg_tool_macro/unstable", "msg_tool_xp3data?/unstable"] jieba = ["jieba-rs"] emote-psb = ["dep:emote-psb", "adler", "lz4"] +xml5ever = ["dep:xml5ever", "markup5ever", "tendril", "crossbeam"] # utils feature utils-bit-stream = [] utils-blowfish = ["byteorder"] diff --git a/src/ext/mod.rs b/src/ext/mod.rs index fbe0026..0dfb90b 100644 --- a/src/ext/mod.rs +++ b/src/ext/mod.rs @@ -7,6 +7,6 @@ pub mod mutex; pub mod path; #[cfg(feature = "emote-psb")] pub mod psb; -#[cfg(feature = "markup5ever_rcdom")] +#[cfg(feature = "xml5ever")] pub mod rcdom; pub mod vec; diff --git a/src/ext/rcdom.rs b/src/ext/rcdom.rs index 89916e2..3bd7939 100644 --- a/src/ext/rcdom.rs +++ b/src/ext/rcdom.rs @@ -1,9 +1,7 @@ //! Extensions for markup5ever_rcdom crate. +use crate::utils::html5ever_arcdom::{AtomicAttribute, Node, NodeData}; use anyhow::Result; -use markup5ever::Attribute; -use markup5ever_rcdom::{Node, NodeData}; -use std::cell::{Ref, RefCell}; -use std::rc::{Rc, Weak}; +use std::sync::{Arc, Mutex, Weak}; /// Extensions for [Node] pub trait NodeExt { @@ -38,15 +36,15 @@ pub trait NodeExt { /// Extensions for [Rc] pub trait RcNodeExt { /// Pushes a child node to the current node. - fn push_child(&self, child: Rc) -> Result<()>; + fn push_child(&self, child: Arc) -> Result<()>; /// Create a deep clone - fn deep_clone(&self, parent: Option>) -> Result>; + fn deep_clone(&self, parent: Option>) -> Result>; /// Create a deep clone with modification of data. fn deep_clone_with_modify Result<()>>( &self, parent: Option>, modify: F, - ) -> Result>; + ) -> Result>; /// Changes a child node at the given index by modifying its data. /// /// Deep clones are needed. @@ -80,7 +78,7 @@ impl NodeExt for Node { fn is_processing_instruction + ?Sized>(&self, name: &S) -> bool { match &self.data { - NodeData::ProcessingInstruction { target, .. } => target.as_ref() == name.as_ref(), + NodeData::ProcessingInstruction { target, .. } => target == name.as_ref(), _ => false, } } @@ -88,9 +86,8 @@ impl NodeExt for Node { fn element_attr_keys<'a>(&'a self) -> Result + 'a>> { match &self.data { NodeData::Element { attrs, .. } => { - let borrowed = attrs.try_borrow()?; - let iter = AttrKeyIter { borrowed, pos: 0 }; - Ok(Box::new(iter)) + let attrs = attrs.lock().unwrap(); + Ok(Box::new(KeyIter { attrs, index: 0 })) } _ => Ok(Box::new(std::iter::empty())), } @@ -99,15 +96,11 @@ impl NodeExt for Node { fn get_attr_value + ?Sized>(&self, name: &S) -> Result> { match &self.data { NodeData::Element { attrs, .. } => { - let borrowed = attrs.try_borrow()?; - if let Some(attr) = borrowed + let attrs = attrs.lock().unwrap(); + Ok(attrs .iter() .find(|a| a.name.local.as_ref() == name.as_ref()) - { - Ok(Some(attr.value.to_string())) - } else { - Ok(None) - } + .map(|a| a.value.to_string())) } _ => Ok(None), } @@ -120,14 +113,14 @@ impl NodeExt for Node { ) -> Result<()> { match &self.data { NodeData::Element { attrs, .. } => { - let mut borrowed = attrs.try_borrow_mut()?; + let mut borrowed = attrs.lock().unwrap(); if let Some(attr) = borrowed .iter_mut() .find(|a| a.name.local.as_ref() == name.as_ref()) { attr.value = value.as_ref().into(); } else { - borrowed.push(Attribute { + borrowed.push(AtomicAttribute { name: markup5ever::QualName::new( None, markup5ever::Namespace::default(), @@ -143,23 +136,23 @@ impl NodeExt for Node { } } -impl RcNodeExt for Rc { - fn push_child(&self, child: Rc) -> Result<()> { - child.parent.replace(Some(Rc::downgrade(self))); - self.children.try_borrow_mut()?.push(child); +impl RcNodeExt for Arc { + fn push_child(&self, child: Arc) -> Result<()> { + child.parent.store(Some(Arc::downgrade(self))); + self.children.lock().unwrap().push(child); Ok(()) } - fn deep_clone(&self, parent: Option>) -> Result> { + fn deep_clone(&self, parent: Option>) -> Result> { let data = self.data.clone2()?; let node = Node { data, - children: RefCell::new(Vec::new()), + children: Mutex::new(Vec::new()), parent: parent.into(), }; - let node = Rc::new(node); - for child in self.children.try_borrow()?.iter() { - let cloned_child = child.deep_clone(Some(Rc::downgrade(&node)))?; + let node = Arc::new(node); + for child in self.children.lock().unwrap().iter() { + let cloned_child = child.deep_clone(Some(Arc::downgrade(&node)))?; node.push_child(cloned_child)?; } Ok(node) @@ -169,17 +162,17 @@ impl RcNodeExt for Rc { &self, parent: Option>, modify: F, - ) -> Result> { + ) -> Result> { let mut data = self.data.clone2()?; modify(&mut data)?; let node = Node { data, - children: RefCell::new(Vec::new()), + children: Mutex::new(Vec::new()), parent: parent.into(), }; - let node = Rc::new(node); - for child in self.children.try_borrow()?.iter() { - let cloned_child = child.deep_clone(Some(Rc::downgrade(&node)))?; + let node = Arc::new(node); + for child in self.children.lock().unwrap().iter() { + let cloned_child = child.deep_clone(Some(Arc::downgrade(&node)))?; node.push_child(cloned_child)?; } Ok(node) @@ -190,13 +183,13 @@ impl RcNodeExt for Rc { index: usize, modify: F, ) -> Result<()> { - let mut children = self.children.try_borrow_mut()?; + let mut children = self.children.lock().unwrap(); if index >= children.len() { return Err(anyhow::anyhow!("Index out of bounds")); } let child = children.remove(index); child.parent.take(); - let nchild = child.deep_clone_with_modify(Some(Rc::downgrade(self)), modify)?; + let nchild = child.deep_clone_with_modify(Some(Arc::downgrade(self)), modify)?; children.insert(index, nchild); Ok(()) } @@ -219,7 +212,7 @@ impl NodeDataExt for NodeData { system_id: system_id.clone(), }, NodeData::Text { contents } => NodeData::Text { - contents: contents.clone(), + contents: Arc::new(Mutex::new(contents.lock().unwrap().clone())), }, NodeData::ProcessingInstruction { target, contents } => { NodeData::ProcessingInstruction { @@ -235,22 +228,22 @@ impl NodeDataExt for NodeData { } => { let name = name.clone(); let mut nattrs = Vec::new(); - for attr in attrs.try_borrow()?.iter() { - nattrs.push(Attribute { + for attr in attrs.lock().unwrap().iter() { + nattrs.push(AtomicAttribute { name: attr.name.clone(), value: attr.value.clone(), }); } - let attrs = RefCell::new(nattrs); - let template = match template_contents.try_borrow()?.as_ref() { + let attrs = Mutex::new(nattrs); + let template = match template_contents.lock().unwrap().as_ref() { Some(tc) => Some(tc.deep_clone(None)?), None => None, }; - let template_contents = RefCell::new(template); + let template_contents = Mutex::new(template); NodeData::Element { name, - attrs, - template_contents, + attrs: Arc::new(attrs), + template_contents: Arc::new(template_contents), mathml_annotation_xml_integration_point: mathml_annotation_xml_integration_point.clone(), } @@ -272,21 +265,21 @@ impl NodeDataExt for NodeData { } } -struct AttrKeyIter<'a> { - borrowed: Ref<'a, Vec>, - pos: usize, +struct KeyIter<'a> { + attrs: std::sync::MutexGuard<'a, Vec>, + index: usize, } -impl<'a> Iterator for AttrKeyIter<'a> { +impl<'a> Iterator for KeyIter<'a> { type Item = String; fn next(&mut self) -> Option { - if self.pos < self.borrowed.len() { - let attr = &self.borrowed[self.pos]; - self.pos += 1; - Some(attr.name.local.to_string()) - } else { + if self.index >= self.attrs.len() { None + } else { + let key = self.attrs[self.index].name.local.as_ref(); + self.index += 1; + Some(key.to_string()) } } } diff --git a/src/scripts/entis_gls/srcxml.rs b/src/scripts/entis_gls/srcxml.rs index 256230b..cf9bf03 100644 --- a/src/scripts/entis_gls/srcxml.rs +++ b/src/scripts/entis_gls/srcxml.rs @@ -4,8 +4,8 @@ use crate::ext::rcdom::*; use crate::scripts::base::*; use crate::types::*; use crate::utils::encoding::*; +use crate::utils::html5ever_arcdom::{ArcDom, Handle, SerializableHandle}; use anyhow::Result; -use markup5ever_rcdom::{Handle, RcDom, SerializableHandle}; use xml5ever::driver::parse_document; use xml5ever::serialize::serialize; use xml5ever::tendril::TendrilSink; @@ -62,11 +62,11 @@ impl SrcXmlScript { /// * `config` - Additional configuration options. pub fn new(buf: Vec, encoding: Encoding, config: &ExtraConfig) -> Result { let decoded = decode_to_string(encoding, &buf, false)?; - let dom = parse_document(RcDom::default(), Default::default()) + let dom = parse_document(ArcDom::default(), Default::default()) .from_utf8() .one(decoded.as_bytes()); { - let error = dom.errors.try_borrow()?; + let error = dom.errors.lock().unwrap(); for e in error.iter() { eprintln!("WARN: Error parsing srcxml: {}", e); crate::COUNTER.inc_warning(); @@ -91,11 +91,11 @@ impl Script for SrcXmlScript { fn extract_messages(&self) -> Result> { let mut messages = Vec::new(); let mut lang = self.lang.clone(); - for i in self.handle.children.try_borrow()?.iter() { + for i in self.handle.children.lock().unwrap().iter() { if i.is_element("xscript") { - for code in i.children.try_borrow()?.iter() { + for code in i.children.lock().unwrap().iter() { if code.is_element("code") { - for ins in code.children.try_borrow()?.iter() { + for ins in code.children.lock().unwrap().iter() { if ins.is_element("msg") { let lan = match lang.as_ref() { Some(l) => l.as_str(), @@ -130,7 +130,7 @@ impl Script for SrcXmlScript { .ok_or(anyhow::anyhow!("text not found"))?; messages.push(Message { name, message }) } else if ins.is_element("select") { - for menu in ins.children.try_borrow()?.iter() { + for menu in ins.children.lock().unwrap().iter() { if menu.is_element("menu") { let lan = match lang.as_ref() { Some(l) => l.as_str(), @@ -179,8 +179,8 @@ impl Script for SrcXmlScript { ) -> Result<()> { let root = self.handle.deep_clone(None)?; if !encoding.is_utf8() { - let len = root.children.try_borrow()?.len(); - if len > 0 && root.children.try_borrow()?[0].is_processing_instruction("xml") { + let len = root.children.lock().unwrap().len(); + if len > 0 && root.children.lock().unwrap()[0].is_processing_instruction("xml") { root.change_child(0, |data| { data.set_processing_instruction_content("version=\"1.0\"") })?; @@ -189,11 +189,11 @@ impl Script for SrcXmlScript { let mut lang = self.lang.clone(); let mut mess = messages.iter(); let mut mes = mess.next(); - for i in root.children.try_borrow()?.iter() { + for i in root.children.lock().unwrap().iter() { if i.is_element("xscript") { - for code in i.children.try_borrow()?.iter() { + for code in i.children.lock().unwrap().iter() { if code.is_element("code") { - for ins in code.children.try_borrow()?.iter() { + for ins in code.children.lock().unwrap().iter() { if ins.is_element("msg") { let m = match mes { Some(m) => m, @@ -247,7 +247,7 @@ impl Script for SrcXmlScript { ins.set_attr_value(text_ref, &message)?; mes = mess.next(); } else if ins.is_element("select") { - for menu in ins.children.try_borrow()?.iter() { + for menu in ins.children.lock().unwrap().iter() { if menu.is_element("menu") { let m = match mes { Some(m) => m, diff --git a/src/scripts/ex_hibit/arc/grp.rs b/src/scripts/ex_hibit/arc/grp.rs index 6e0f468..2c29ab8 100644 --- a/src/scripts/ex_hibit/arc/grp.rs +++ b/src/scripts/ex_hibit/arc/grp.rs @@ -329,9 +329,19 @@ fn parse_name_info(name: &str) -> Result { )); } let arc_num = std::str::from_utf8(digits) - .with_context(|| format!("Failed to parse archive number from '{:#?}' (digits '{:#?}').", name, digits))? + .with_context(|| { + format!( + "Failed to parse archive number from '{:#?}' (digits '{:#?}').", + name, digits + ) + })? .parse::() - .with_context(|| format!("Failed to parse archive number from '{:#?}' (digits '{:#?}').", name, digits))?; + .with_context(|| { + format!( + "Failed to parse archive number from '{:#?}' (digits '{:#?}').", + name, digits + ) + })?; Ok(NameInfo { digits_offset: 3, digits_len: digits.len(), diff --git a/src/utils/html5ever_arcdom.rs b/src/utils/html5ever_arcdom.rs new file mode 100644 index 0000000..99af5ad --- /dev/null +++ b/src/utils/html5ever_arcdom.rs @@ -0,0 +1,642 @@ +use std::borrow::Cow; +use std::collections::{HashSet, VecDeque}; +use std::fmt; +use std::io; +use std::mem; +use std::sync::{Arc, Mutex, Weak}; + +use crossbeam::atomic::AtomicCell; +use tendril::StrTendril; + +use markup5ever::Attribute; +use markup5ever::ExpandedName; +use markup5ever::QualName; +use markup5ever::interface::tree_builder; +use markup5ever::interface::tree_builder::{ElementFlags, NodeOrText, QuirksMode, TreeSink}; +use markup5ever::serialize::TraversalScope; +use markup5ever::serialize::TraversalScope::{ChildrenOnly, IncludeNode}; +use markup5ever::serialize::{Serialize, Serializer}; +use xml5ever::interface::ElemName; +use xml5ever::local_name; + +#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug)] +pub struct AtomicAttribute { + pub name: QualName, + pub value: String, +} + +impl From for AtomicAttribute { + fn from(attr: Attribute) -> Self { + AtomicAttribute { + name: attr.name, + value: attr.value.to_string(), + } + } +} + +#[derive(Debug, Clone)] +pub enum NodeData { + /// The `Document` itself - the root node of a HTML document. + Document, + + /// A `DOCTYPE` with name, public id, and system id. See + /// [document type declaration on wikipedia][dtd wiki]. + /// + /// [dtd wiki]: https://en.wikipedia.org/wiki/Document_type_declaration + Doctype { + name: String, + public_id: String, + system_id: String, + }, + + /// A text node. + Text { contents: Arc> }, + + /// A comment. + Comment { contents: String }, + + /// An element with attributes. + Element { + name: QualName, + attrs: Arc>>, + + /// For HTML \ elements, the [template contents]. + /// + /// [template contents]: https://html.spec.whatwg.org/multipage/#template-contents + template_contents: Arc>>, + + /// Whether the node is a [HTML integration point]. + /// + /// [HTML integration point]: https://html.spec.whatwg.org/multipage/#html-integration-point + mathml_annotation_xml_integration_point: bool, + }, + + /// A Processing instruction. + ProcessingInstruction { target: String, contents: String }, +} + +/// A DOM node. +pub struct Node { + /// Parent node. + pub parent: AtomicCell>, + /// Child nodes of this node. + pub children: Mutex>, + /// Represents this node's data. + pub data: NodeData, +} + +impl Node { + /// Create a new node from its contents + pub fn new(data: NodeData) -> Arc { + Arc::new(Node { + data, + parent: AtomicCell::new(None), + children: Mutex::new(Vec::new()), + }) + } + + /// + fn get_option_element_nearest_ancestor_select(&self) -> Option> { + // Step 1. Let ancestorOptgroup be null. + // NOTE: The algorithm doesn't actually need the value, so a boolean is enough. + let mut did_see_ancestor_optgroup = false; + + // Step 2. For each ancestor of option's ancestors, in reverse tree order: + let mut current = self.parent().and_then(|parent| parent.upgrade())?; + loop { + if let NodeData::Element { name, .. } = ¤t.data { + // Step 2.1 If ancestor is a datalist, hr, or option element, then return null. + if matches!( + name.local_name(), + &local_name!("datalist") | &local_name!("hr") | &local_name!("option") + ) { + return None; + } + + // Step 2.2 If ancestor is an optgroup element: + if name.local_name() == &local_name!("optgroup") { + // Step 2.2.1 If ancestorOptgroup is not null, then return null. + if did_see_ancestor_optgroup { + return None; + } + + // Step 2.2.2 Set ancestorOptgroup to ancestor. + did_see_ancestor_optgroup = true; + } + + // Step 2.3 If ancestor is a select, then return ancestor. + if name.local_name() == &local_name!("select") { + return Some(current); + } + }; + + // Move on to the next ancestor + let Some(next_ancestor) = current.parent().and_then(|parent| parent.upgrade()) else { + break; + }; + current = next_ancestor; + } + + // Step 3. Return null. + None + } + + fn parent(&self) -> Option> { + let parent = self.parent.take(); + self.parent.store(parent.clone()); + parent + } + + /// + fn get_a_selects_enabled_selectedcontent(&self) -> Option> { + // Step 1. If select has the multiple attribute, then return null. + let NodeData::Element { name, attrs, .. } = &self.data else { + panic!("Trying to get selectedcontent of non-element"); + }; + debug_assert_eq!(name.local_name(), &local_name!("select")); + if attrs + .lock() + .unwrap() + .iter() + .any(|attribute| attribute.name.local == local_name!("multiple")) + { + return None; + } + + // Step 2. Let selectedcontent be the first selectedcontent element descendant of select in tree order + // if any such element exists; otherwise return null. + // FIXME: This does not visit the nodes in tree order + let mut remaining = VecDeque::default(); + remaining.extend(self.children.lock().unwrap().iter().cloned()); + let mut selectedcontent = None; + while let Some(node) = remaining.pop_front() { + remaining.extend(node.children.lock().unwrap().iter().cloned()); + + let NodeData::Element { name, .. } = &self.data else { + continue; + }; + if name.local_name() == &local_name!("selectedcontent") { + selectedcontent = Some(node); + break; + } + } + let selectedcontent = selectedcontent?; + + // Step 3. If selectedcontent's disabled is true, then return null. + // FIXME: This step is unimplemented for now to reduce complexity. + + // Step 4. Return selectedcontent. + Some(selectedcontent) + } + + /// + fn clone_an_option_into_selectedcontent(&self, selectedcontent: Arc) { + // Step 1. Let documentFragment be a new DocumentFragment whose node document is option's node document. + // NOTE: We just remember the children of said fragment, thats good enough. + let mut document_fragment = Vec::new(); + + // Step 2. For each child of option's children: + for child in self.children.lock().unwrap().iter() { + // Step 2.1 Let childClone be the result of running clone given child with subtree set to true. + let child_clone = child.clone_with_subtree(); + + // Step 2.2 Append childClone to documentFragment. + document_fragment.push(child_clone); + } + + // Step 3. Replace all with documentFragment within selectedcontent. + *selectedcontent.children.lock().unwrap() = document_fragment; + } + + /// Clones the node and all of its descendants, returning a handle to the new subtree. + /// + /// This function will run into infinite recursion when the DOM tree contains cycles and it makes + /// no attempts to guard against that. + fn clone_with_subtree(&self) -> Arc { + let children = self + .children + .lock() + .unwrap() + .iter() + .map(|child| child.clone_with_subtree()) + .collect(); + Arc::new(Self { + parent: AtomicCell::new(self.parent()), + data: self.data.clone(), + children: Mutex::new(children), + }) + } +} + +impl fmt::Debug for Node { + fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { + fmt.debug_struct("Node") + .field("data", &self.data) + .field("children", &self.children) + .finish() + } +} + +/// Reference to a DOM node. +pub type Handle = Arc; + +/// Weak reference to a DOM node, used for parent pointers. +pub type WeakHandle = Weak; + +/// Append a parentless node to another nodes' children +fn append(new_parent: &Handle, child: Handle) { + let previous_parent = child.parent.swap(Some(Arc::downgrade(new_parent))); + // Invariant: child cannot have existing parent + assert!(previous_parent.is_none()); + new_parent.children.lock().unwrap().push(child); +} + +/// If the node has a parent, get it and this node's position in its children +fn get_parent_and_index(target: &Handle) -> Option<(Handle, usize)> { + if let Some(weak) = target.parent.take() { + let parent = weak.upgrade().expect("dangling weak pointer"); + target.parent.store(Some(weak)); + let i = match parent + .children + .lock() + .unwrap() + .iter() + .enumerate() + .find(|&(_, child)| Arc::ptr_eq(child, target)) + { + Some((i, _)) => i, + None => panic!("have parent but couldn't find in parent's children!"), + }; + Some((parent, i)) + } else { + None + } +} + +fn append_to_existing_text(prev: &Handle, text: &str) -> bool { + match prev.data { + NodeData::Text { ref contents } => { + contents.lock().unwrap().push_str(text); + true + } + _ => false, + } +} + +fn remove_from_parent(target: &Handle) { + if let Some((parent, i)) = get_parent_and_index(target) { + parent.children.lock().unwrap().remove(i); + target.parent.store(None); + } +} + +/// The DOM itself; the result of parsing. +pub struct ArcDom { + /// The `Document` itself. + pub document: Handle, + + /// Errors that occurred during parsing. + pub errors: Mutex>>, + + /// The document's quirks mode. + pub quirks_mode: AtomicCell, +} + +impl TreeSink for ArcDom { + type Output = Self; + fn finish(self) -> Self { + self + } + + type Handle = Handle; + + type ElemName<'a> + = ExpandedName<'a> + where + Self: 'a; + + fn parse_error(&self, msg: Cow<'static, str>) { + self.errors.lock().unwrap().push(msg); + } + + fn get_document(&self) -> Handle { + self.document.clone() + } + + fn get_template_contents(&self, target: &Handle) -> Handle { + if let NodeData::Element { + ref template_contents, + .. + } = target.data + { + template_contents + .lock() + .unwrap() + .as_ref() + .expect("not a template element!") + .clone() + } else { + panic!("not a template element!") + } + } + + fn set_quirks_mode(&self, mode: QuirksMode) { + self.quirks_mode.store(mode); + } + + fn same_node(&self, x: &Handle, y: &Handle) -> bool { + Arc::ptr_eq(x, y) + } + + fn elem_name<'a>(&self, target: &'a Handle) -> ExpandedName<'a> { + match target.data { + NodeData::Element { ref name, .. } => name.expanded(), + _ => panic!("not an element!"), + } + } + + fn create_element(&self, name: QualName, attrs: Vec, flags: ElementFlags) -> Handle { + Node::new(NodeData::Element { + name, + attrs: Arc::new(Mutex::new( + attrs.into_iter().map(AtomicAttribute::from).collect(), + )), + template_contents: Arc::new(Mutex::new(if flags.template { + Some(Node::new(NodeData::Document)) + } else { + None + })), + mathml_annotation_xml_integration_point: flags.mathml_annotation_xml_integration_point, + }) + } + + fn create_comment(&self, text: StrTendril) -> Handle { + Node::new(NodeData::Comment { + contents: text.to_string(), + }) + } + + fn create_pi(&self, target: StrTendril, data: StrTendril) -> Handle { + Node::new(NodeData::ProcessingInstruction { + target: target.to_string(), + contents: data.to_string(), + }) + } + + fn append(&self, parent: &Handle, child: NodeOrText) { + // Append to an existing Text node if we have one. + if let NodeOrText::AppendText(text) = &child { + if let Some(h) = parent.children.lock().unwrap().last() { + if append_to_existing_text(h, text) { + return; + } + } + } + + append( + parent, + match child { + NodeOrText::AppendText(text) => Node::new(NodeData::Text { + contents: Arc::new(Mutex::new(text.to_string())), + }), + NodeOrText::AppendNode(node) => node, + }, + ); + } + + fn append_before_sibling(&self, sibling: &Handle, child: NodeOrText) { + let (parent, i) = get_parent_and_index(sibling) + .expect("append_before_sibling called on node without parent"); + + let child = match (child, i) { + // No previous node. + (NodeOrText::AppendText(text), 0) => Node::new(NodeData::Text { + contents: Arc::new(Mutex::new(text.to_string())), + }), + + // Look for a text node before the insertion point. + (NodeOrText::AppendText(text), i) => { + let children = parent.children.lock().unwrap(); + let prev = &children[i - 1]; + if append_to_existing_text(prev, &text) { + return; + } + Node::new(NodeData::Text { + contents: Arc::new(Mutex::new(text.to_string())), + }) + } + + // The tree builder promises we won't have a text node after + // the insertion point. + + // Any other kind of node. + (NodeOrText::AppendNode(node), _) => node, + }; + + remove_from_parent(&child); + + child.parent.store(Some(Arc::downgrade(&parent))); + parent.children.lock().unwrap().insert(i, child); + } + + fn append_based_on_parent_node( + &self, + element: &Self::Handle, + prev_element: &Self::Handle, + child: NodeOrText, + ) { + let parent = element.parent.take(); + let has_parent = parent.is_some(); + element.parent.store(parent); + + if has_parent { + self.append_before_sibling(element, child); + } else { + self.append(prev_element, child); + } + } + + fn append_doctype_to_document( + &self, + name: StrTendril, + public_id: StrTendril, + system_id: StrTendril, + ) { + append( + &self.document, + Node::new(NodeData::Doctype { + name: name.to_string(), + public_id: public_id.to_string(), + system_id: system_id.to_string(), + }), + ); + } + + fn add_attrs_if_missing(&self, target: &Handle, attrs: Vec) { + let mut existing = if let NodeData::Element { ref attrs, .. } = target.data { + attrs.lock().unwrap() + } else { + panic!("not an element") + }; + + let existing_names = existing + .iter() + .map(|e| e.name.clone()) + .collect::>(); + existing.extend( + attrs + .into_iter() + .filter(|attr| !existing_names.contains(&attr.name)) + .map(AtomicAttribute::from), + ); + } + + fn remove_from_parent(&self, target: &Handle) { + remove_from_parent(target); + } + + fn reparent_children(&self, node: &Handle, new_parent: &Handle) { + let mut children = node.children.lock().unwrap(); + let mut new_children = new_parent.children.lock().unwrap(); + for child in children.iter() { + let previous_parent = child.parent.swap(Some(Arc::downgrade(new_parent))); + assert!(Arc::ptr_eq( + node, + &previous_parent.unwrap().upgrade().expect("dangling weak") + )) + } + new_children.extend(mem::take(&mut *children)); + } + + fn is_mathml_annotation_xml_integration_point(&self, target: &Handle) -> bool { + if let NodeData::Element { + mathml_annotation_xml_integration_point, + .. + } = target.data + { + mathml_annotation_xml_integration_point + } else { + panic!("not an element!") + } + } + + fn maybe_clone_an_option_into_selectedcontent(&self, option: &Self::Handle) { + let NodeData::Element { name, attrs, .. } = &option.data else { + panic!("\"maybe clone an option into selectedcontent\" called with non-element node"); + }; + debug_assert_eq!(name.local_name(), &local_name!("option")); + + // Step 1. Let select be option's option element nearest ancestor select. + let select = option.get_option_element_nearest_ancestor_select(); + + // Step 2. If all of the following conditions are true: + // * select is not null; + // * option's selectedness is true; and + // * select's enabled selectedcontent is not null, + // then run clone an option into a selectedcontent given option and select's enabled selectedcontent. + if let Some(selectedcontent) = + select.and_then(|select| select.get_a_selects_enabled_selectedcontent()) + { + if attrs + .lock() + .unwrap() + .iter() + .any(|attribute| attribute.name.local == local_name!("selected")) + { + option.clone_an_option_into_selectedcontent(selectedcontent); + } + } + } +} + +impl Default for ArcDom { + fn default() -> ArcDom { + ArcDom { + document: Node::new(NodeData::Document), + errors: Default::default(), + quirks_mode: AtomicCell::new(tree_builder::NoQuirks), + } + } +} + +enum SerializeOp { + Open(Handle), + Close(QualName), +} + +pub struct SerializableHandle(Handle); + +impl From for SerializableHandle { + fn from(h: Handle) -> SerializableHandle { + SerializableHandle(h) + } +} + +impl Serialize for SerializableHandle { + fn serialize(&self, serializer: &mut S, traversal_scope: TraversalScope) -> io::Result<()> + where + S: Serializer, + { + let mut ops = VecDeque::new(); + match traversal_scope { + IncludeNode => ops.push_back(SerializeOp::Open(self.0.clone())), + ChildrenOnly(_) => ops.extend( + self.0 + .children + .lock() + .unwrap() + .iter() + .map(|h| SerializeOp::Open(h.clone())), + ), + } + + while let Some(op) = ops.pop_front() { + match op { + SerializeOp::Open(handle) => match handle.data { + NodeData::Element { + ref name, + ref attrs, + .. + } => { + serializer.start_elem( + name.clone(), + attrs + .lock() + .unwrap() + .iter() + .map(|at| (&at.name, &at.value[..])), + )?; + + ops.reserve(1 + handle.children.lock().unwrap().len()); + ops.push_front(SerializeOp::Close(name.clone())); + + for child in handle.children.lock().unwrap().iter().rev() { + ops.push_front(SerializeOp::Open(child.clone())); + } + } + + NodeData::Doctype { ref name, .. } => serializer.write_doctype(name)?, + + NodeData::Text { ref contents } => { + serializer.write_text(&contents.lock().unwrap())? + } + + NodeData::Comment { ref contents } => serializer.write_comment(contents)?, + + NodeData::ProcessingInstruction { + ref target, + ref contents, + } => serializer.write_processing_instruction(target, contents)?, + + NodeData::Document => panic!("Can't serialize Document node itself"), + }, + + SerializeOp::Close(name) => { + serializer.end_elem(name)?; + } + } + } + + Ok(()) + } +} diff --git a/src/utils/mod.rs b/src/utils/mod.rs index b2ff038..8f63749 100644 --- a/src/utils/mod.rs +++ b/src/utils/mod.rs @@ -16,6 +16,8 @@ pub mod escape; pub mod files; #[cfg(feature = "audio-flac")] pub mod flac; +#[cfg(feature = "xml5ever")] +pub mod html5ever_arcdom; #[cfg(feature = "image")] pub mod img; #[cfg(feature = "image-jxl")]