update

2021-01-06 18:54:28 +08:00
parent afa2a8ea17
commit bb0065c762
9 changed files with 1432 additions and 24 deletions
--- a/rssparser.py
+++ b/rssparser.py
@@ -1,13 +1,121 @@
+# (C) 2021 lifegpc
+# This file is part of rssbot.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published
+# by the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
 from xml.dom import minidom
+from html.parser import HTMLParser
+from html import escape
 import sys
 import requests
+from traceback import format_exc
+
+
+class HTMLSimpleParser(HTMLParser):
+    def __init__(self):
+        self.data = ''
+        self.istag = False
+        self.tagContent = ''
+        self.tagAttrs = ''
+        self.imgList = []
+        self.videoList = []
+        HTMLParser.__init__(self)
+
+    def handle_startendtag(self, tag, attrs):
+        if tag == 'br':
+            data = data + '\n'
+
+    def handle_starttag(self, tag, attrs):
+        if tag == 'br':
+            self.data = self.data + '\n'
+            return
+        elif tag == 'img':
+            for key, value in attrs:
+                if key == 'src':
+                    self.imgList.append(value)
+                    break
+            return
+        elif tag == 'video':
+            p = {}
+            for key, value in attrs:
+                if key == 'src':
+                    p['src'] = value
+                if key == 'poster':
+                    p['poster'] = value
+            if 'src' in p:
+                self.videoList.append(p)
+            return
+        self.istag = True
+        self.tagContent = ''
+        self.tagAttrs = ''
+        if tag == 'a':
+            for key, value in attrs:
+                if key == 'href':
+                    self.tagAttrs = f'{self.tagAttrs} href="{value}"'
+
+    def handle_data(self, data):
+        if self.istag:
+            self.tagContent = self.tagContent + data
+        else:
+            self.data = self.data + data
+
+    def handle_endtag(self, tag):
+        self.istag = False
+        if tag in ['a', 'b', 'i', 'u', 's', 'strong', 'em', 'ins', 'strike', 'del', 'code', 'pre']:
+            self.data = f"{self.data}<{tag}{self.tagAttrs}>{self.tagContent}</{tag}>"
+        self.tagAttrs = ''


 class RSSParser:
    def __init__(self):
        pass

-    def check(self):
+    def __checkasratom(self):
+        self._root = self.xmldoc.documentElement
+        if self._root.nodeName != 'feed':
+            return False
+        m = {}
+        itemList = []
+        for i in self._root.childNodes:
+            if i.nodeName == 'entry':
+                itemList.append(self.__dealItemAtom(i))
+            elif i.nodeName == 'link':
+                if 'href' in i.attributes:
+                    m[i.nodeName] = i.attributes['href'].nodeValue
+            elif i.nodeName == 'author':
+                if len(i.childNodes) == 1 and i.firstChild.nodeName == 'name':
+                    name = i.firstChild
+                    if len(name.childNodes) == 1 and name.firstChild.nodeName == '#cdata-section':
+                        m['author'] = name.firstChild.nodeValue
+            else:
+                if len(i.childNodes) == 0:
+                    m[i.nodeName] = i.nodeValue
+                elif len(i.childNodes) == 1 and i.firstChild.nodeName == '#cdata-section':
+                    m[i.nodeName] = i.firstChild.nodeValue
+                else:
+                    m[i.nodeName] = ''
+                    for k in i.childNodes:
+                        m[i.nodeName] = m[i.nodeName] + k.toxml()
+        if 'title' not in m or m['title'] is None or m['title'] == '':
+            return False
+        self.m = m
+        self.title = m['title']
+        self.ttl = None
+        self.itemList = itemList
+        self._type = 'atom'
+        return True
+
+    def __checkasrss3(self):
        self._root = self.xmldoc.documentElement
        if self._root.localName != 'rss' or len(self._root.childNodes) != 1:
            return False
@@ -15,13 +123,94 @@ class RSSParser:
        if self._root2.localName != 'channel':
            return False
        m = {}
+        itemList = []
        for i in self._root2.childNodes:
-            if i.localName == 'item':
-                pass
+            if i.nodeName == 'item':
+                itemList.append(self.__dealItem(i))
+            elif i.nodeName == 'atom:link':
+                if 'href' in i.attributes:
+                    m[i.nodeName] = i.attributes['href'].nodeValue
            else:
-                m[i.localName] = i.firstChild.nodeValue if len(
-                    i.childNodes) > 0 else i.nodeValue
-        print()
+                if len(i.childNodes) == 0:
+                    m[i.nodeName] = i.nodeValue
+                elif len(i.childNodes) == 1 and i.firstChild.nodeName == '#cdata-section':
+                    m[i.nodeName] = i.firstChild.nodeValue
+                else:
+                    m[i.nodeName] = ''
+                    for k in i.childNodes:
+                        m[i.nodeName] = m[i.nodeName] + k.toxml()
+        if 'title' not in m or m['title'] is None or m['title'] == '':
+            return False
+        self.m = m
+        self.title = m['title']
+        self.ttl = None
+        if 'ttl' in m and m['ttl'] is not None and m['ttl'].isnumeric():
+            self.ttl = int(m['ttl'])
+        self.itemList = itemList
+        self._type = 'rss3.0'
+        return True
+
+    def __dealItem(self, node):
+        m = {}
+        for i in node.childNodes:
+            if len(i.childNodes) == 0:
+                m[i.nodeName] = i.nodeValue
+            elif len(i.childNodes) == 1 and i.firstChild.nodeName == '#cdata-section':
+                p = HTMLSimpleParser()
+                p.feed(i.firstChild.nodeValue)
+                if p.data == '' and i.firstChild.nodeValue.find('<') == -1:
+                    m[i.nodeName] = i.firstChild.nodeValue
+                else:
+                    m[i.nodeName] = p.data
+                if i.nodeName == 'description':
+                    m['imgList'] = p.imgList
+                    m['videoList'] = p.videoList
+            else:
+                m[i.nodeName] = ''
+                for k in i.childNodes:
+                    m[i.nodeName] = m[i.nodeName] + k.toxml()
+        return m
+
+    def __dealItemAtom(self, node):
+        m = {}
+        for i in node.childNodes:
+            if i.nodeName == 'author':
+                if len(i.childNodes) == 1 and i.firstChild.nodeName == 'name':
+                    name = i.firstChild
+                    if len(name.childNodes) == 1 and name.firstChild.nodeName == '#cdata-section':
+                        m['author'] = name.firstChild.nodeValue
+            elif i.nodeName == 'link':
+                if 'href' in i.attributes:
+                    m[i.nodeName] = i.attributes['href'].nodeValue
+            elif len(i.childNodes) == 0:
+                m[i.nodeName] = i.nodeValue
+            elif len(i.childNodes) == 1 and i.firstChild.nodeName == '#cdata-section':
+                p = HTMLSimpleParser()
+                p.feed(i.firstChild.nodeValue)
+                if p.data == '' and i.firstChild.nodeValue.find('<') == -1:
+                    m[i.nodeName] = i.firstChild.nodeValue
+                else:
+                    m[i.nodeName] = p.data
+                if i.nodeName == 'content':
+                    m['imgList'] = p.imgList
+                    m['videoList'] = p.videoList
+                    m['description'] = m['content']
+                    del m['content']
+            else:
+                m[i.nodeName] = ''
+                for k in i.childNodes:
+                    m[i.nodeName] = m[i.nodeName] + k.toxml()
+        return m
+
+    def check(self):
+        try:
+            checked = self.__checkasrss3()
+            if not checked:
+                checked = self.__checkasratom()
+            return checked
+        except:
+            print(format_exc())
+            return False

    def normalize(self):
        self.removeblank(self.xmldoc.documentElement)