From 670f55f3666ae386c7616a53439fb11de7f23a95 Mon Sep 17 00:00:00 2001 From: lifegpc Date: Sun, 12 Nov 2023 09:01:23 +0000 Subject: [PATCH] Update spliter --- src/push/mod.rs | 4 +- src/push/telegram/text.rs | 332 +++++++++++++++++++++++++++++++++++++- 2 files changed, 327 insertions(+), 9 deletions(-) diff --git a/src/push/mod.rs b/src/push/mod.rs index b172143..b061458 100644 --- a/src/push/mod.rs +++ b/src/push/mod.rs @@ -1,6 +1,6 @@ /// [EveryPush](https://github.com/PeanutMelonSeedBigAlmond/EveryPush) client pub mod every_push; -/// Telegram push client -pub mod telegram; /// [Pushdeer](https://github.com/easychen/pushdeer) client pub mod pushdeer; +/// Telegram push client +pub mod telegram; diff --git a/src/push/telegram/text.rs b/src/push/telegram/text.rs index 76743ea..1cfc2c5 100644 --- a/src/push/telegram/text.rs +++ b/src/push/telegram/text.rs @@ -1,7 +1,16 @@ use crate::error::PixivDownloaderError; +use crate::ext::replace::ReplaceWith; use html5ever::tendril::TendrilSink; use html5ever::{parse_document, ParseOpts, QualName}; use markup5ever_rcdom::{Node, NodeData, RcDom}; +use std::collections::BTreeMap; + +fn encode_data + ?Sized>(data: &S) -> String { + data.as_ref() + .replace("&", "&") + .replace("<", "<") + .replace(">", ">") +} #[derive(Clone, Debug, PartialEq)] /// Message Entity Type @@ -112,6 +121,14 @@ impl Stack { } } +fn push_end(ends: &mut BTreeMap>, end: usize, s: String) { + if let Some(i) = ends.get_mut(&end) { + i.push(s); + } else { + ends.insert(end, vec![s]); + } +} + pub struct TextSpliter { entities: Vec, text: String, @@ -124,6 +141,156 @@ impl TextSpliter { TextSpliterBuilder::default() } + pub fn get_str(&mut self, len: usize) -> String { + let mut ends = BTreeMap::>::new(); + let mut cur_pos = 0usize; + let mut text = String::new(); + let mut first = true; + while let Some(pos) = { + if first { + first = false; + self.entities + .iter() + .find(|e| e.offset >= cur_pos && e.offset < len) + .map(|e| e.offset.clone()) + } else { + let fend = ends + .first_key_value() + .filter(|(k, _)| *k <= &len) + .map(|(k, _)| k.clone()); + let start = self + .entities + .iter() + .find(|e| e.offset > cur_pos && e.offset < len) + .map(|e| e.offset.clone()); + if let Some(fend) = fend { + if let Some(start) = start { + Some(start.min(fend)) + } else { + Some(fend) + } + } else { + start + } + } + } { + if cur_pos < pos { + text.push_str(&encode_data(&self.text_get(cur_pos, pos))); + cur_pos = pos; + } + if let Some(end) = ends.remove(&pos) { + for i in end.iter().rev() { + text.push_str(i); + } + } + for i in self.entities.iter() { + if i.offset != pos { + continue; + } + let end = match &i.typ { + MessageEntityType::Url => { + if len >= i.offset + i.length { + "" + } else { + text.push_str(""); + "" + } + } + MessageEntityType::Bold => { + text.push_str(""); + "" + } + MessageEntityType::Italic => { + text.push_str(""); + "" + } + MessageEntityType::Underline => { + text.push_str(""); + "" + } + MessageEntityType::Strikethrough => { + text.push_str(""); + "" + } + MessageEntityType::Spoiler => { + text.push_str(""); + "" + } + MessageEntityType::Code => { + text.push_str(""); + "" + } + MessageEntityType::Pre { language } => match language { + Some(language) => { + text.push_str("
");
+                            "
" + } + None => { + text.push_str("
");
+                            "
" + } + }, + MessageEntityType::TextLink { url } => { + text.push_str(""); + "" + } + MessageEntityType::CustomEmoji { custom_emoji_id } => { + text.push_str(""); + "" + } + }; + push_end(&mut ends, i.offset + i.length, end.to_string()); + } + } + if cur_pos < len { + text.push_str(&encode_data(&self.text_get(cur_pos, len))); + } + for (_, v) in ends.iter() { + for i in v.iter().rev() { + text.push_str(i); + } + } + let entities = self.entities.replace_with(Vec::new()); + for e in entities { + if e.offset + e.length > len { + if e.offset >= len { + self.entities.push(MessageEntity { + offset: e.offset - len, + ..e + }) + } else { + if matches!(e.typ, MessageEntityType::Url) { + self.entities.push(MessageEntity { + offset: 0, + length: e.offset + e.length - len, + typ: MessageEntityType::TextLink { + url: self.text_get(e.offset, e.offset + e.length), + }, + }) + } else { + self.entities.push(MessageEntity { + offset: 0, + length: e.offset + e.length - len, + ..e + }) + } + } + } + } + let v = self.text.encode_utf16().skip(len).collect::>(); + self.text = String::from_utf16_lossy(&v); + text + } + fn is_conflict_with_link_entities(&self, offset: usize, length: usize) -> bool { for entity in &self.entities { if !matches!(entity.typ, MessageEntityType::TextLink { .. }) { @@ -144,6 +311,15 @@ impl TextSpliter { false } + fn is_in_entities(&self, pos: usize) -> bool { + for entity in &self.entities { + if pos >= entity.offset && pos < entity.offset + entity.length { + return true; + } + } + false + } + fn iter(&mut self, node: &Node) { match &node.data { NodeData::Text { contents } => { @@ -311,6 +487,43 @@ impl TextSpliter { } } + pub fn to_html(&mut self, max_len: Option) -> String { + let max_len = max_len.unwrap_or(self.opts._max_length); + let mut len = self.text_len(); + let lens = self + .text + .split('\n') + .map(|s| s.encode_utf16().count()) + .collect::>(); + for i in lens.iter().rev() { + if len <= max_len { + if !self.is_in_entities(len) { + return self.get_str(len); + } + } + len -= i; + if len > 0 { + len -= 1; + } + } + len = self.text_len(); + for i in lens.iter().rev() { + if len <= max_len { + return self.get_str(len); + } + len -= i; + } + let max_len = max_len.min(self.text_len()); + for i in self.entities.iter() { + if max_len > i.offset && max_len < i.offset + i.length { + if matches!(i.typ, MessageEntityType::CustomEmoji { .. }) { + return self.get_str(i.offset); + } + } + } + self.get_str(max_len) + } + pub fn parse + ?Sized>(&mut self, text: &S) -> Result<(), PixivDownloaderError> { let opts = ParseOpts::default(); let dom = parse_document(RcDom::default(), opts) @@ -329,10 +542,17 @@ impl TextSpliter { fn scan_link(&mut self) { let mut offset = 0; - while let Some(i) = self.text[offset..] - .find("http://") - .or_else(|| self.text[offset..].find("https://")) - { + while let Some(i) = { + let i = self.text[offset..].find("http://"); + let i2 = self.text[offset..].find("https://"); + match i { + Some(i) => match i2 { + Some(i2) => Some(i.min(i2)), + None => Some(i), + }, + None => i2, + } + } { let mut length = 0; for c in self.text[offset + i..].chars() { if c != ' ' && c != '\n' && c != '\r' && c != '\t' { @@ -342,7 +562,9 @@ impl TextSpliter { } } let boffset = self.text[..offset + i].encode_utf16().count(); - let tlen = self.text[offset + i..offset + i + length].encode_utf16().count(); + let tlen = self.text[offset + i..offset + i + length] + .encode_utf16() + .count(); if length > 0 && !self.is_conflict_with_link_entities(boffset, tlen) { self.entities.push(MessageEntity { typ: MessageEntityType::Url, @@ -354,15 +576,24 @@ impl TextSpliter { } } - fn sort(&mut self) { self.entities.sort_by(|a, b| { a.offset .cmp(&b.offset) - .then_with(|| a.length.cmp(&b.length)) + .then_with(|| b.length.cmp(&a.length)) }); } + fn text_get(&self, start: usize, end: usize) -> String { + let v = self + .text + .encode_utf16() + .skip(start) + .take(end - start) + .collect::>(); + String::from_utf16_lossy(&v) + } + #[inline] fn text_len(&self) -> usize { self.text.encode_utf16().count() @@ -502,3 +733,90 @@ fn test_parse2() { },] ); } + +#[test] +fn test_split() { + let mut spliter = TextSpliter::default(); + spliter + .parse("testdad1234<>&") + .unwrap(); + assert_eq!( + spliter.to_html(None), + "testdad1234<>&" + ); + assert_eq!(spliter.entities.len(), 0); + assert_eq!(spliter.text, String::from("")); +} + +#[test] +fn test_split2() { + let mut spliter = TextSpliter::default(); + spliter.parse("testtest").unwrap(); + assert_eq!(spliter.to_html(Some(5)), "testt"); + assert_eq!(spliter.entities.len(), 1); + assert_eq!(spliter.text, String::from("est")); + assert_eq!(spliter.to_html(Some(5)), "est"); + assert_eq!(spliter.entities.len(), 0); + assert_eq!(spliter.text, String::from("")); +} + +#[test] +fn test_split3() { + let mut spliter = TextSpliter::default(); + spliter + .parse("testd\nhttps://www.pixiv.net") + .unwrap(); + assert_eq!(spliter.to_html(Some(22)), "testd\n"); + assert_eq!(spliter.entities.len(), 2); + assert_eq!(spliter.to_html(Some(22)), "https://www.pixiv.net"); +} + +#[test] +fn test_split4() { + let mut spliter = TextSpliter::builder().max_length(12).build(); + spliter.parse("https://www.pixiv.net").unwrap(); + assert_eq!( + spliter.to_html(None), + "https://www." + ); + assert_eq!( + spliter.to_html(None), + "pixiv.net" + ); + assert_eq!(spliter.entities.len(), 0); +} + +#[test] +fn test_split5() { + let mut spliter = TextSpliter::builder() + .max_length(12) + .disable_scan_link() + .build(); + spliter.parse("https://www.pixiv.net").unwrap(); + assert_eq!(spliter.to_html(None), "https://www."); + assert_eq!(spliter.to_html(None), "pixiv.net"); + assert_eq!(spliter.entities.len(), 0); +} + +#[test] +fn test_split6() { + let mut spliter = TextSpliter::default(); + spliter + .parse("test\ntest\n123") + .unwrap(); + assert_eq!(spliter.to_html(Some(12)), "test"); + assert_eq!(spliter.entities.len(), 2); +} + +#[test] +fn test_split7() { + let mut spliter = TextSpliter::builder().max_length(3).build(); + spliter + .parse("δΈ­ζ–‡πŸ‘") + .unwrap(); + assert_eq!(spliter.to_html(None), "δΈ­ζ–‡"); + assert_eq!( + spliter.to_html(None), + "πŸ‘" + ); +}