# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
from time import strftime, localtime, timezone
from urllib.parse import unquote, urlsplit
from html import escape
from html.parser import HTMLParser
from typing import Tuple

# Telegram uses "\n" for line breaks and has no <br> tag.  Internally every
# "\n" is replaced by this marker before parsing so the whole message is one
# physical line and HTMLParser.getpos()[1] is an absolute character index.
_BR = '<br>'


class TGHTMLParser(HTMLParser):
    """Parse Telegram-flavoured HTML to measure and split a message.

    The entries collected here are only used to compute the visible length
    of a message and to cut it safely.  Do not send them to Telegram.

    After ``feed()`` + ``close()``:

    * ``s`` holds the visible text (tags removed, character references
      resolved, ``<br>`` turned back into ``"\\n"``).
    * ``messageEntries`` is a list of dicts, one per supported tag, with:
      ``type``/``tag``; ``offset``/``length`` (indices into ``s``);
      ``start``/``datastart`` (span of the opening tag in the ``<br>``-encoded
      source); ``dataend``/``end`` (span of the closing tag in that source);
      and ``url`` for ``<a href=...>``.
    """

    # Supported tag name -> entry type.
    _TAG_TYPES = {
        'b': 'bold', 'strong': 'bold',
        'i': 'italic', 'em': 'italic',
        'u': 'underline', 'ins': 'underline',
        's': 'strikethrough', 'strike': 'strikethrough',
        'del': 'strikethrough',
        'a': 'url',
        'code': 'code', 'pre': 'pre',
    }

    def __init__(self):
        HTMLParser.__init__(self)
        self.messageEntries = []
        self.s = ''
        self.__open = []  # stack of entries whose closing tag is pending

    def close(self):
        """Finish parsing and finalise the source spans of every entry.

        For a closed entry ``dataend`` becomes the index of its ``</tag>``
        and ``end`` the index just past it.  The closing tag is assumed to
        be written exactly as ``</tag>`` (``len(tag) + 3`` characters).

        An entry whose closing tag never appeared is reduced to an empty
        entry that covers only its opening tag, so no bogus closing-tag
        span is ever recorded for it.  Calling ``close()`` again is a no-op
        for entries that were already finalised.
        """
        HTMLParser.close(self)
        total = self.getpos()[1]  # everything has been consumed by now
        for e in self.__open:
            if e['datastart'] == -1:
                e['datastart'] = total
            e['length'] = 0
            e['dataend'] = e['end'] = e['datastart']
        self.__open = []
        for e in self.messageEntries:
            if 'dataend' in e:
                continue  # unclosed, or finalised by an earlier close()
            e['dataend'] = e['end']
            e['end'] = e['end'] + len(e['tag']) + 3

    def feed(self, s: str):
        """Feed *s* to the parser with every ``"\\n"`` replaced by ``<br>``.

        Telegram does not support ``<br>`` and uses ``"\\n"`` instead; the
        replacement keeps the input on one line so positions are absolute.
        """
        HTMLParser.feed(self, s.replace('\n', _BR))

    def __mark_data_start(self):
        """Record where the content of the innermost open entry begins."""
        if self.__open and self.__open[-1]['datastart'] == -1:
            self.__open[-1]['datastart'] = self.getpos()[1]

    def handle_data(self, data: str):
        self.s = self.s + data
        self.__mark_data_start()

    def handle_starttag(self, tag: str, attrs):
        self.__mark_data_start()
        if tag == 'br':
            self.s = self.s + '\n'
            return
        typ = self._TAG_TYPES.get(tag)
        if typ is None:
            return  # unsupported tag
        entry = {'type': typ, 'offset': len(self.s), 'length': 0, 'tag': tag,
                 'start': self.getpos()[1], 'end': 0, 'datastart': -1}
        if typ == 'url':
            # Only the first href attribute counts.
            for name, value in attrs:
                if name == 'href':
                    if value is not None:
                        entry['url'] = value
                    break
        self.__open.append(entry)
        self.messageEntries.append(entry)

    def handle_endtag(self, tag: str):
        self.__mark_data_start()
        if tag not in self._TAG_TYPES or not self.__open:
            return
        # NOTE(review): the end tag is not checked against the tag that
        # opened the entry; mis-nested HTML closes the innermost entry.
        entry = self.__open.pop()
        entry['end'] = self.getpos()[1]
        entry['length'] = len(self.s) - entry['offset']


class MessageEntries:
    """Helper around the entry list produced by ``TGHTMLParser``."""

    def __init__(self, l: list):
        self.__list = l

    def __len__(self):
        return len(self.__list)

    @staticmethod
    def __render(entries: list, s: str, re: str, begin: int, stop: int) -> list:
        """Return the HTML pieces for visible characters ``re[begin:stop]``."""
        parts = []
        for i in range(begin, stop):
            # Opening tags, outermost first.
            parts.extend(s[v['start']:v['datastart']]
                         for v in entries if v['offset'] == i)
            # The visible text was unescaped by the parser; escape it again
            # so that "<", ">" and "&" stay valid HTML in the output.
            parts.append(escape(re[i], quote=False))
            # Closing tags, innermost first.
            parts.extend(s[v['dataend']:v['end']]
                         for v in reversed(entries)
                         if v['offset'] + v['length'] - 1 == i)
        return parts

    def getStr(self, l: int, s: str, re: str) -> Tuple[str, str]:
        """Split a message after *l* visible characters.

        *s* is the original HTML string and *re* is the parsed (visible)
        text of the same message.  Entries that are open at the cut are
        closed at the end of the first part and reopened at the start of
        the second, so both parts are well-formed on their own.

        *l* is clamped to ``0 .. len(re)``.  Empty entries are dropped
        because they format nothing.

        Returns ``(first_part, remainder)`` as HTML strings.
        """
        # Entry positions refer to the <br>-encoded source.
        s = s.replace('\n', _BR)
        entries = sorted((e for e in self.__list if e['length'] > 0),
                         key=lambda d: d['start'])
        l = max(0, min(l, len(re)))
        # Entries that contain the cut point strictly inside them.
        crossing = [v for v in entries
                    if v['offset'] < l < v['offset'] + v['length']]

        head = self.__render(entries, s, re, 0, l)
        head.extend(s[v['dataend']:v['end']] for v in reversed(crossing))

        tail = [s[v['start']:v['datastart']] for v in crossing]
        tail.extend(self.__render(entries, s, re, l, len(re)))
        return ''.join(head), ''.join(tail)

    def isOkWithOrigin(self, l: int) -> bool:
        """Return True if source index *l* is not strictly inside an entry.

        *l* indexes the ``<br>``-encoded original string.
        """
        return not any(v['start'] < l < v['end'] for v in self.__list)

    def isOkWithRe(self, l: int) -> bool:
        """Return True if visible-text index *l* is not strictly inside an
        entry."""
        return not any(v['offset'] < l < v['offset'] + v['length']
                       for v in self.__list)


class textc:
    def __len__(self):
        """Return the length of the text as counted after HTML parsing."""
        p = TGHTMLParser()
        p.feed(self.__str)
        p.close()
        return len(p.s)

    def __init__(self):
        self.__str = ''
        # NOTE(review): the visible diff hunk ends here.  Any further
        # initialisation (for example of self.__max, which the methods
        # below read) lies outside the visible part of the file.

    # NOTE(review): between the two diff hunks an unseen method ends with
    # ``return len(self) <= self.__max``.  Its header is not visible, so it
    # is not reproduced here.

    def cut(self):
        """Remove and return the longest leading part that fits the limit.

        The limit (``self.__max``) applies to the parsed, visible text.
        Whole lines are preferred: the last line boundary that fits is
        used.  If that boundary is outside every entry the original lines
        are returned untouched; otherwise, or if not even the first line
        fits, ``MessageEntries.getStr`` performs the split so open tags are
        closed and reopened.  The remainder is kept for the next call.
        An empty buffer yields ``''``.
        """
        p = TGHTMLParser()
        p.feed(self.__str)
        p.close()
        m = MessageEntries(p.messageEntries)
        raw_lines = self.__str.splitlines(True)
        parsed_lines = p.s.splitlines(True)
        # Cumulative (source index, visible index) at the end of each line.
        bounds = []
        raw_end = 0
        parsed_end = 0
        for raw, parsed in zip(raw_lines, parsed_lines):
            raw_end += len(raw)
            if raw.endswith('\n'):
                # "\n" is encoded as "<br>" for the parser: 3 extra chars.
                raw_end += len(_BR) - 1
            parsed_end += len(parsed)
            bounds.append((raw_end, parsed_end))
        for idx in reversed(range(len(bounds))):
            raw_end, parsed_end = bounds[idx]
            if parsed_end > self.__max:
                continue
            # Make sure the HTML is not broken by the cut.
            if m.isOkWithOrigin(raw_end) and m.isOkWithRe(parsed_end):
                r = ''.join(raw_lines[:idx + 1])
                self.__str = ''.join(raw_lines[idx + 1:])
                return r
            r, self.__str = m.getStr(parsed_end, self.__str, p.s)
            return r
        r, self.__str = m.getStr(self.__max, self.__str, p.s)
        return r

    # NOTE(review): ``def tostr(self, maxLength: int = 4096):`` follows in
    # the file; its body is beyond the visible diff.