From d422f1b9c861aeb5e6203fef51eab95dbf5f4b23 Mon Sep 17 00:00:00 2001
From: lifegpc
Date: Tue, 19 Jan 2021 21:58:50 +0800
Subject: [PATCH] fix bug in rss html parser

---
 rssparser.py | 65 +++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 49 insertions(+), 16 deletions(-)

diff --git a/rssparser.py b/rssparser.py
index a59f6eb..d5f254b 100644
--- a/rssparser.py
+++ b/rssparser.py
@@ -22,12 +22,29 @@ from traceback import format_exc
 from urllib.parse import urljoin
 
 
+class HTMLContent:
+    def __init__(self):
+        self.__list = []
+
+    def add(self, s: str, needescaped: bool = False):
+        self.__list.append((s, needescaped))
+
+    def export(self) -> str:
+        r = ''
+        for s, e in self.__list:
+            if e:
+                r = r + escape(s)
+            else:
+                r = r + s
+        return r
+
+
 class HTMLSimpleParser(HTMLParser):
-    def __init__(self, baseUrl: str=None):
+    def __init__(self, baseUrl: str = None):
         self.data = ''
-        self.istag = False
-        self.tagContent = ''
-        self.tagAttrs = ''
+        self.tagName = []
+        self.tagContent = []
+        self.tagAttrs = []
         self.imgList = []
         self.videoList = []
         self.baseUrl = ''
@@ -37,11 +54,17 @@ class HTMLSimpleParser(HTMLParser):
 
     def handle_startendtag(self, tag, attrs):
         if tag == 'br':
-            self.data = self.data + '\n'
+            if len(self.tagName) == 0:
+                self.data = self.data + '\n'
+            else:
+                self.tagContent[-1].add('\n')
 
     def handle_starttag(self, tag, attrs):
         if tag == 'br':
-            self.data = self.data + '\n'
+            if len(self.tagName) == 0:
+                self.data = self.data + '\n'
+            else:
+                self.tagContent[-1].add('\n')
             return
         elif tag == 'img':
             for key, value in attrs:
@@ -59,27 +82,37 @@ class HTMLSimpleParser(HTMLParser):
             if 'src' in p:
                 self.videoList.append(p)
             return
-        self.istag = True
-        self.tagContent = ''
-        self.tagAttrs = ''
+        self.tagName.append(tag)
+        self.tagContent.append(HTMLContent())
+        self.tagAttrs.append('')
         if tag == 'a':
             for key, value in attrs:
                 if key == 'href':
-                    self.tagAttrs = f'{self.tagAttrs} href="{urljoin(self.baseUrl, value)}"'
+                    self.tagAttrs[-1] = f'{self.tagAttrs[-1]} href="{urljoin(self.baseUrl, value)}"'
 
     def handle_data(self, data):
-        if self.istag:
-            self.tagContent = self.tagContent + data
+        if len(self.tagName) > 0:
+            self.tagContent[-1].add(data, True)
         else:
             self.data = self.data + escape(data)
 
     def handle_endtag(self, tag):
-        self.istag = False
         if tag in ['a', 'b', 'i', 'u', 's', 'strong', 'em', 'ins', 'strike', 'del', 'code', 'pre']:
-            self.data = f"{self.data}<{tag}{self.tagAttrs}>{escape(self.tagContent)}"
+            if len(self.tagName) == 1:
+                self.data = f"{self.data}<{tag}{self.tagAttrs[-1]}>{self.tagContent[-1].export()}"
+            elif len(self.tagName) > 1:
+                self.tagContent[-2].add(
+                    f"<{tag}{self.tagAttrs[-1]}>{self.tagContent[-1].export()}")
         elif tag not in ['img', 'video', 'br']:
-            self.data = f"{self.data}{escape(self.tagContent)}"
-        self.tagAttrs = ''
+            if len(self.tagName) == 1:
+                self.data = f"{self.data}{self.tagContent[-1].export()}"
+            elif len(self.tagName) > 1:
+                self.tagContent[-2].add(f"{self.tagContent[-1].export()}")
+        else:
+            return
+        self.tagName = self.tagName[:-1]
+        self.tagContent = self.tagContent[:-1]
+        self.tagAttrs = self.tagAttrs[:-1]
 
 
 class RSSParser: