diff --git a/textc.py b/textc.py
index 46bb44d..2e96ffe 100644
--- a/textc.py
+++ b/textc.py
@@ -14,12 +14,153 @@
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see .
from time import strftime, localtime, timezone
-from urllib.parse import unquote
+from urllib.parse import unquote, urlsplit
+from html.parser import HTMLParser
+
+
+class TGHTMLParser(HTMLParser):
+ """Only for calculate message's length and cut message.
+ Do not send these message entries to Telegram."""
+
+ def __init__(self):
+ HTMLParser.__init__(self)
+ self.messageEntries = []
+ self.s = ''
+ self.__tagL = []
+ self.__entryL = []
+
+ def close(self):
+ """This function will do some check for message entries after close."""
+ HTMLParser.close(self)
+ for e in self.messageEntries:
+ e['dataend'] = e['end']
+ e['end'] = e['end'] + len(e['tag']) + 3
+
+ def feed(self, s: str):
+ """This function will repelace \\n with <br>.
+ Telegram do not support <br> and use \\n."""
+ HTMLParser.feed(self, s.replace('\n', '
'))
+
+ def handle_data(self, data: str):
+ self.s = self.s + data
+ if len(self.__entryL) > 0:
+ if self.__entryL[-1]['datastart'] == -1:
+ self.__entryL[-1]['datastart'] = self.getpos()[1]
+
+ def handle_starttag(self, tag: str, attrs):
+ if len(self.__entryL) > 0:
+ if self.__entryL[-1]['datastart'] == -1:
+ self.__entryL[-1]['datastart'] = self.getpos()[1]
+ typ = ''
+ if tag in ['b', 'strong']:
+ typ = 'bold'
+ elif tag in ['i', 'em']:
+ typ = 'italic'
+ elif tag in ['u', 'ins']:
+ typ = 'underline'
+ elif tag in ['s', 'strike', 'del']:
+ typ = 'strikethrough'
+ elif tag in ['a']:
+ typ = 'url'
+ link = None
+ for name, value in attrs:
+ if name == 'href':
+ link = value
+ break
+ elif tag in ['code', 'pre']:
+ typ = tag
+ elif tag == 'br':
+ self.s = self.s + '\n'
+ return
+ else:
+ return # unsuppoted tag
+ t = {'type': typ, 'offset': len(self.s), 'length': 0, 'tag': tag, 'start': self.getpos()[
+ 1], 'end': 0, 'datastart': -1}
+ if typ == 'url' and link is not None:
+ t['url'] = link
+ self.__entryL.append(t)
+ self.messageEntries.append(t)
+ self.__tagL.append(tag)
+
+ def handle_endtag(self, tag: str):
+ if len(self.__entryL) > 0:
+ if self.__entryL[-1]['datastart'] == -1:
+ self.__entryL[-1]['datastart'] = self.getpos()[1]
+ if tag not in ['b', 'strong', 'i', 'em', 'u', 'ins', 's', 'strike', 'del', 'a', 'pre', 'code']:
+ return
+ if len(self.__tagL) <= 0:
+ return
+ if self.__entryL[-1]['datastart'] == -1:
+ self.__entryL[-1]['datastart'] = self.getpos()[1]
+ self.__entryL[-1]['end'] = self.getpos()[1]
+ self.__entryL[-1]['length'] = len(self.s) - self.__entryL[-1]['offset']
+ self.__entryL = self.__entryL[:-1]
+ self.__tagL = self.__tagL[:-1]
+
+
+class MessageEntries:
+ def __init__(self, l: list):
+ self.__list = l
+
+ def __len__(self):
+ return len(self.__list)
+
+ def getStr(self, l: int, s: str, re: str) -> (str, str):
+ """split string
+ s is origin string
+ re is parsed string"""
+ s = s.replace(
+ '\n', '
') # the start and end is needed work with
+ t = sorted(self.__list, key=lambda d: d['start'])
+ r = ''
+ i = 0
+ while i < l:
+ for v in t:
+ if v['offset'] == i:
+ r = r + s[v['start']:v['datastart']]
+ r = r + re[i]
+ for v in reversed(t):
+ if v['offset'] + v['length'] - 1 == i:
+ r = r + s[v['dataend']:v['end']]
+ i = i + 1
+ for v in reversed(t):
+ if i - 1 >= v['offset'] and v['offset'] + v['length'] - 1 > i - 1:
+ r = r + s[v['dataend']:v['end']]
+ r2 = ''
+ i = l
+ for v in t:
+ if i > v['offset'] and v['offset'] + v['length'] - 1 >= i:
+ r2 = r2 + s[v['start']:v['datastart']]
+ while i < len(re):
+ for v in t:
+ if v['offset'] == i:
+ r2 = r2 + s[v['start']:v['datastart']]
+ r2 = r2 + re[i]
+ for v in reversed(t):
+ if v['offset'] + v['length'] - 1 == i:
+ r2 = r2 + s[v['dataend']:v['end']]
+ i = i + 1
+ return r, r2
+
+ def isOkWithOrigin(self, l: int) -> bool:
+ for v in self.__list:
+ if l > v['start'] and l < v['end']:
+ return False
+ return True
+
+ def isOkWithRe(self, l: int) -> bool:
+ for v in self.__list:
+ if l > v['offset'] and l < (v['offset'] + v['length']):
+ return False
+ return True
class textc:
def __len__(self):
- return len(self.__str)
+ p = TGHTMLParser()
+ p.feed(self.__str)
+ p.close()
+ return len(p.s)
def __init__(self):
self.__str = ''
@@ -32,20 +173,42 @@ class textc:
return len(self) <= self.__max
def cut(self):
- """TODO: Need check html is not breaked
- TODO: Need calculate char limits after parsed"""
+ """Split string"""
+ p = TGHTMLParser()
+ p.feed(self.__str)
+ p.close()
+ m = MessageEntries(p.messageEntries)
l = self.__str.splitlines(True)
- r = ''
- while len(f"{r}{l[0]}") <= self.__max:
- r = r + l[0]
- l = l[1:]
- if len(r) == 0:
- r = l[0][:self.__max]
- l[0] = l[0][self.__max:]
+ l2 = p.s.splitlines(True)
+ originlen = [] # line's origin length
+ rlen = [] # line's length after parsing
t = ''
- for i in l:
- t = t + i
- self.__str = t
+ t2 = ''
+ z = 0 # calculate the offset because of replace \n with
+ for i in range(min(len(l), len(l2))):
+ if l[i].endswith('\n'):
+ z = z + 1
+ t = t + l[i]
+ t2 = t2 + l2[i]
+ originlen.append(len(t) + 3 * z)
+ rlen.append(len(t2))
+ for i in reversed(range(len(originlen))):
+ if rlen[i] <= self.__max: # check the length
+ # make sure not break HTML
+ if m.isOkWithOrigin(originlen[i]) and m.isOkWithRe(rlen[i]):
+ r = ''
+ for k in range(i + 1):
+ r = r + l[k]
+ l = l[i+1:]
+ t = ''
+ for i in l:
+ t = t + i
+ self.__str = t
+ return r
+ else:
+ r, self.__str = m.getStr(rlen[i], self.__str, p.s)
+ return r
+ r, self.__str = m.getStr(self.__max, self.__str, p.s)
return r
def tostr(self, maxLength: int = 4096):