update

2021-01-07 14:17:02 +08:00
parent bef75562ae
commit 7d0aa5abc2
6 changed files with 289 additions and 58 deletions
--- a/rssparser.py
+++ b/rssparser.py
@@ -15,20 +15,24 @@
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 from xml.dom import minidom
 from html.parser import HTMLParser
-from html import escape
+from html import escape, unescape
 import sys
 import requests
 from traceback import format_exc
+from urllib.parse import urljoin


 class HTMLSimpleParser(HTMLParser):
-    def __init__(self):
+    def __init__(self, baseUrl: str=None):
        self.data = ''
        self.istag = False
        self.tagContent = ''
        self.tagAttrs = ''
        self.imgList = []
        self.videoList = []
+        self.baseUrl = ''
+        if baseUrl is not None:
+            self.baseUrl = baseUrl
        HTMLParser.__init__(self)

    def handle_startendtag(self, tag, attrs):
@@ -42,16 +46,16 @@ class HTMLSimpleParser(HTMLParser):
        elif tag == 'img':
            for key, value in attrs:
                if key == 'src':
-                    self.imgList.append(value)
+                    self.imgList.append(urljoin(self.baseUrl, value))
                    break
            return
        elif tag == 'video':
            p = {}
            for key, value in attrs:
                if key == 'src':
-                    p['src'] = value
+                    p['src'] = urljoin(self.baseUrl, value)
                if key == 'poster':
-                    p['poster'] = value
+                    p['poster'] = urljoin(self.baseUrl, value)
            if 'src' in p:
                self.videoList.append(p)
            return
@@ -61,18 +65,20 @@ class HTMLSimpleParser(HTMLParser):
        if tag == 'a':
            for key, value in attrs:
                if key == 'href':
-                    self.tagAttrs = f'{self.tagAttrs} href="{value}"'
+                    self.tagAttrs = f'{self.tagAttrs} href="{urljoin(self.baseUrl, value)}"'

    def handle_data(self, data):
        if self.istag:
            self.tagContent = self.tagContent + data
        else:
-            self.data = self.data + data
+            self.data = self.data + escape(data)

    def handle_endtag(self, tag):
        self.istag = False
        if tag in ['a', 'b', 'i', 'u', 's', 'strong', 'em', 'ins', 'strike', 'del', 'code', 'pre']:
-            self.data = f"{self.data}<{tag}{self.tagAttrs}>{self.tagContent}</{tag}>"
+            self.data = f"{self.data}<{tag}{self.tagAttrs}>{escape(self.tagContent)}</{tag}>"
+        elif tag not in ['img', 'video', 'br']:
+            self.data = f"{self.data}{escape(self.tagContent)}"
        self.tagAttrs = ''


@@ -90,13 +96,19 @@ class RSSParser:
            if i.nodeName == 'entry':
                itemList.append(self.__dealItemAtom(i))
            elif i.nodeName == 'link':
-                if 'href' in i.attributes:
+                typ = 'text/html'
+                if 'type' in i.attributes:
+                    typ = i.attributes['type'].nodeValue
+                if 'href' in i.attributes and typ == 'text/html':
                    m[i.nodeName] = i.attributes['href'].nodeValue
            elif i.nodeName == 'author':
-                if len(i.childNodes) == 1 and i.firstChild.nodeName == 'name':
-                    name = i.firstChild
-                    if len(name.childNodes) == 1 and name.firstChild.nodeName == '#cdata-section':
-                        m['author'] = name.firstChild.nodeValue
+                for k in i.childNodes:
+                    if k.nodeName == 'name':
+                        m['author'] = k.nodeValue
+                        break
+                    elif len(k.childNodes) == 1 and k.firstChild.nodeName == '#cdata-section':
+                        m['author'] = k.firstChild.nodeValue
+                        break
            else:
                if len(i.childNodes) == 0:
                    m[i.nodeName] = i.nodeValue
@@ -115,7 +127,7 @@ class RSSParser:
        self._type = 'atom'
        return True

-    def __checkasrss3(self):
+    def __checkasrss2(self):
        self._root = self.xmldoc.documentElement
        if self._root.localName != 'rss' or len(self._root.childNodes) != 1:
            return False
@@ -147,16 +159,29 @@ class RSSParser:
        if 'ttl' in m and m['ttl'] is not None and m['ttl'].isnumeric():
            self.ttl = int(m['ttl'])
        self.itemList = itemList
-        self._type = 'rss3.0'
+        self._type = 'rss2.0'
        return True

    def __dealItem(self, node):
        m = {}
        for i in node.childNodes:
-            if len(i.childNodes) == 0:
+            if i.nodeName == 'link':
+                if len(i.childNodes) == 0:
+                    m[i.nodeName] = i.nodeValue
+                else:
+                    m[i.nodeName] = ''
+                    for k in i.childNodes:
+                        m[i.nodeName] = m[i.nodeName] + k.toxml()
+                break
+        for i in node.childNodes:
+            if i.nodeName == 'link':
+                continue
+            elif len(i.childNodes) == 0:
                m[i.nodeName] = i.nodeValue
            elif len(i.childNodes) == 1 and i.firstChild.nodeName == '#cdata-section':
                p = HTMLSimpleParser()
+                if 'link' in m and m['link'] is not None:
+                    p.baseUrl = m['link']
                p.feed(i.firstChild.nodeValue)
                if p.data == '' and i.firstChild.nodeValue.find('<') == -1:
                    m[i.nodeName] = i.firstChild.nodeValue
@@ -174,28 +199,95 @@ class RSSParser:
    def __dealItemAtom(self, node):
        m = {}
        for i in node.childNodes:
-            if i.nodeName == 'author':
-                if len(i.childNodes) == 1 and i.firstChild.nodeName == 'name':
-                    name = i.firstChild
-                    if len(name.childNodes) == 1 and name.firstChild.nodeName == '#cdata-section':
-                        m['author'] = name.firstChild.nodeValue
-            elif i.nodeName == 'link':
+            if i.nodeName == 'link':
                if 'href' in i.attributes:
                    m[i.nodeName] = i.attributes['href'].nodeValue
+        for i in node.childNodes:
+            if i.nodeName == 'author':
+                for k in i.childNodes:
+                    if k.nodeName == 'name':
+                        if k.nodeValue is not None:
+                            m['author'] = k.nodeValue
+                            break
+                        elif len(k.childNodes) == 1 and k.firstChild.nodeName == '#cdata-section':
+                            m['author'] = k.firstChild.nodeValue
+                            break
+            elif i.nodeName == 'link':
+                continue
+            elif i.nodeName in ['title', 'content', 'summary']:
+                typ = 'text'
+                if 'type' in i.attributes:
+                    if i.attributes['type'].nodeValue in ['text', 'html', 'xhtml']:
+                        typ = i.attributes['type'].nodeValue
+                if len(i.childNodes) == 1 and i.firstChild.nodeName == '#cdata-section':
+                    p = HTMLSimpleParser()
+                    if 'link' in m and m['link'] is not None:
+                        p.baseUrl = m['link']
+                    p.feed(i.firstChild.nodeValue)
+                    if p.data == '' and i.firstChild.nodeValue.find('<') == -1:
+                        m[i.nodeName] = i.firstChild.nodeValue
+                    else:
+                        m[i.nodeName] = p.data
+                    if i.nodeName in ['content', 'summary']:
+                        m['imgList'] = p.imgList
+                        m['videoList'] = p.videoList
+                        m['description'] = m[i.nodeName]
+                        del m[i.nodeName]
+                elif i.nodeValue is None and len(i.childNodes) == 0:
+                    continue
+                elif typ == 'text':
+                    s = ''
+                    if i.nodeValue is not None:
+                        s = i.nodeValue
+                    else:
+                        for k in i.childNodes:
+                            s = s + k.toxml()
+                    m[i.nodeName] = unescape(s)
+                elif typ == 'html':
+                    s = ''
+                    if i.nodeValue is not None:
+                        s = i.nodeValue
+                    else:
+                        for k in i.childNodes:
+                            s = s + k.toxml()
+                    p = HTMLSimpleParser()
+                    if 'link' in m and m['link'] is not None:
+                        p.baseUrl = m['link']
+                    p.feed(unescape(s))
+                    if p.data == '' and i.firstChild.nodeValue.find('<') == -1:
+                        m[i.nodeName] = i.firstChild.nodeValue
+                    else:
+                        m[i.nodeName] = p.data
+                    if i.nodeName in ['content', 'summary']:
+                        m['imgList'] = p.imgList
+                        m['videoList'] = p.videoList
+                        m['description'] = m[i.nodeName]
+                        del m[i.nodeName]
+                elif typ == 'xhtml':
+                    p = HTMLSimpleParser()
+                    if 'link' in m and m['link'] is not None:
+                        p.baseUrl = m['link']
+                    p.feed(i.firstChild.toxml())
+                    if p.data == '' and i.firstChild.nodeValue.find('<') == -1:
+                        m[i.nodeName] = i.firstChild.nodeValue
+                    else:
+                        m[i.nodeName] = p.data
+                    if i.nodeName in ['content', 'summary']:
+                        m['imgList'] = p.imgList
+                        m['videoList'] = p.videoList
+                        m['description'] = m[i.nodeName]
+                        del m[i.nodeName]
            elif len(i.childNodes) == 0:
                m[i.nodeName] = i.nodeValue
            elif len(i.childNodes) == 1 and i.firstChild.nodeName == '#cdata-section':
                p = HTMLSimpleParser()
+                if 'link' in m and m['link'] is not None:
+                    p.baseUrl = m['link']
                p.feed(i.firstChild.nodeValue)
                if p.data == '' and i.firstChild.nodeValue.find('<') == -1:
                    m[i.nodeName] = i.firstChild.nodeValue
                else:
                    m[i.nodeName] = p.data
-                if i.nodeName == 'content':
-                    m['imgList'] = p.imgList
-                    m['videoList'] = p.videoList
-                    m['description'] = m['content']
-                    del m['content']
            else:
                m[i.nodeName] = ''
                for k in i.childNodes:
@@ -203,14 +295,15 @@ class RSSParser:
        return m

    def check(self):
-        try:
-            checked = self.__checkasrss3()
-            if not checked:
-                checked = self.__checkasratom()
-            return checked
-        except:
-            print(format_exc())
-            return False
+        for f in [self.__checkasrss2, self.__checkasratom]:
+            try:
+                if f():
+                    self.m['_type'] = self._type
+                    return True
+            except:
+                print(format_exc())
+                pass
+        return False

    def normalize(self):
        self.removeblank(self.xmldoc.documentElement)
@@ -243,4 +336,7 @@ if __name__ == "__main__":
        fn = sys.argv[1]
        p = RSSParser()
        p.parse(fn)
-        p.check()
+        if p.check():
+            print(p._type)
+        else:
+            print('解析失败')