update
This commit is contained in:
201
rssparser.py
201
rssparser.py
@@ -1,13 +1,121 @@
|
||||
# (C) 2021 lifegpc
|
||||
# This file is part of rssbot.
|
||||
#
|
||||
# This program is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU Affero General Public License as published
|
||||
# by the Free Software Foundation, either version 3 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU Affero General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU Affero General Public License
|
||||
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
from xml.dom import minidom
|
||||
from html.parser import HTMLParser
|
||||
from html import escape
|
||||
import sys
|
||||
import requests
|
||||
from traceback import format_exc
|
||||
|
||||
|
||||
class HTMLSimpleParser(HTMLParser):
|
||||
def __init__(self):
|
||||
self.data = ''
|
||||
self.istag = False
|
||||
self.tagContent = ''
|
||||
self.tagAttrs = ''
|
||||
self.imgList = []
|
||||
self.videoList = []
|
||||
HTMLParser.__init__(self)
|
||||
|
||||
def handle_startendtag(self, tag, attrs):
|
||||
if tag == 'br':
|
||||
data = data + '\n'
|
||||
|
||||
def handle_starttag(self, tag, attrs):
|
||||
if tag == 'br':
|
||||
self.data = self.data + '\n'
|
||||
return
|
||||
elif tag == 'img':
|
||||
for key, value in attrs:
|
||||
if key == 'src':
|
||||
self.imgList.append(value)
|
||||
break
|
||||
return
|
||||
elif tag == 'video':
|
||||
p = {}
|
||||
for key, value in attrs:
|
||||
if key == 'src':
|
||||
p['src'] = value
|
||||
if key == 'poster':
|
||||
p['poster'] = value
|
||||
if 'src' in p:
|
||||
self.videoList.append(p)
|
||||
return
|
||||
self.istag = True
|
||||
self.tagContent = ''
|
||||
self.tagAttrs = ''
|
||||
if tag == 'a':
|
||||
for key, value in attrs:
|
||||
if key == 'href':
|
||||
self.tagAttrs = f'{self.tagAttrs} href="{value}"'
|
||||
|
||||
def handle_data(self, data):
|
||||
if self.istag:
|
||||
self.tagContent = self.tagContent + data
|
||||
else:
|
||||
self.data = self.data + data
|
||||
|
||||
def handle_endtag(self, tag):
|
||||
self.istag = False
|
||||
if tag in ['a', 'b', 'i', 'u', 's', 'strong', 'em', 'ins', 'strike', 'del', 'code', 'pre']:
|
||||
self.data = f"{self.data}<{tag}{self.tagAttrs}>{self.tagContent}</{tag}>"
|
||||
self.tagAttrs = ''
|
||||
|
||||
|
||||
class RSSParser:
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
def check(self):
|
||||
def __checkasratom(self):
|
||||
self._root = self.xmldoc.documentElement
|
||||
if self._root.nodeName != 'feed':
|
||||
return False
|
||||
m = {}
|
||||
itemList = []
|
||||
for i in self._root.childNodes:
|
||||
if i.nodeName == 'entry':
|
||||
itemList.append(self.__dealItemAtom(i))
|
||||
elif i.nodeName == 'link':
|
||||
if 'href' in i.attributes:
|
||||
m[i.nodeName] = i.attributes['href'].nodeValue
|
||||
elif i.nodeName == 'author':
|
||||
if len(i.childNodes) == 1 and i.firstChild.nodeName == 'name':
|
||||
name = i.firstChild
|
||||
if len(name.childNodes) == 1 and name.firstChild.nodeName == '#cdata-section':
|
||||
m['author'] = name.firstChild.nodeValue
|
||||
else:
|
||||
if len(i.childNodes) == 0:
|
||||
m[i.nodeName] = i.nodeValue
|
||||
elif len(i.childNodes) == 1 and i.firstChild.nodeName == '#cdata-section':
|
||||
m[i.nodeName] = i.firstChild.nodeValue
|
||||
else:
|
||||
m[i.nodeName] = ''
|
||||
for k in i.childNodes:
|
||||
m[i.nodeName] = m[i.nodeName] + k.toxml()
|
||||
if 'title' not in m or m['title'] is None or m['title'] == '':
|
||||
return False
|
||||
self.m = m
|
||||
self.title = m['title']
|
||||
self.ttl = None
|
||||
self.itemList = itemList
|
||||
self._type = 'atom'
|
||||
return True
|
||||
|
||||
def __checkasrss3(self):
|
||||
self._root = self.xmldoc.documentElement
|
||||
if self._root.localName != 'rss' or len(self._root.childNodes) != 1:
|
||||
return False
|
||||
@@ -15,13 +123,94 @@ class RSSParser:
|
||||
if self._root2.localName != 'channel':
|
||||
return False
|
||||
m = {}
|
||||
itemList = []
|
||||
for i in self._root2.childNodes:
|
||||
if i.localName == 'item':
|
||||
pass
|
||||
if i.nodeName == 'item':
|
||||
itemList.append(self.__dealItem(i))
|
||||
elif i.nodeName == 'atom:link':
|
||||
if 'href' in i.attributes:
|
||||
m[i.nodeName] = i.attributes['href'].nodeValue
|
||||
else:
|
||||
m[i.localName] = i.firstChild.nodeValue if len(
|
||||
i.childNodes) > 0 else i.nodeValue
|
||||
print()
|
||||
if len(i.childNodes) == 0:
|
||||
m[i.nodeName] = i.nodeValue
|
||||
elif len(i.childNodes) == 1 and i.firstChild.nodeName == '#cdata-section':
|
||||
m[i.nodeName] = i.firstChild.nodeValue
|
||||
else:
|
||||
m[i.nodeName] = ''
|
||||
for k in i.childNodes:
|
||||
m[i.nodeName] = m[i.nodeName] + k.toxml()
|
||||
if 'title' not in m or m['title'] is None or m['title'] == '':
|
||||
return False
|
||||
self.m = m
|
||||
self.title = m['title']
|
||||
self.ttl = None
|
||||
if 'ttl' in m and m['ttl'] is not None and m['ttl'].isnumeric():
|
||||
self.ttl = int(m['ttl'])
|
||||
self.itemList = itemList
|
||||
self._type = 'rss3.0'
|
||||
return True
|
||||
|
||||
def __dealItem(self, node):
|
||||
m = {}
|
||||
for i in node.childNodes:
|
||||
if len(i.childNodes) == 0:
|
||||
m[i.nodeName] = i.nodeValue
|
||||
elif len(i.childNodes) == 1 and i.firstChild.nodeName == '#cdata-section':
|
||||
p = HTMLSimpleParser()
|
||||
p.feed(i.firstChild.nodeValue)
|
||||
if p.data == '' and i.firstChild.nodeValue.find('<') == -1:
|
||||
m[i.nodeName] = i.firstChild.nodeValue
|
||||
else:
|
||||
m[i.nodeName] = p.data
|
||||
if i.nodeName == 'description':
|
||||
m['imgList'] = p.imgList
|
||||
m['videoList'] = p.videoList
|
||||
else:
|
||||
m[i.nodeName] = ''
|
||||
for k in i.childNodes:
|
||||
m[i.nodeName] = m[i.nodeName] + k.toxml()
|
||||
return m
|
||||
|
||||
def __dealItemAtom(self, node):
|
||||
m = {}
|
||||
for i in node.childNodes:
|
||||
if i.nodeName == 'author':
|
||||
if len(i.childNodes) == 1 and i.firstChild.nodeName == 'name':
|
||||
name = i.firstChild
|
||||
if len(name.childNodes) == 1 and name.firstChild.nodeName == '#cdata-section':
|
||||
m['author'] = name.firstChild.nodeValue
|
||||
elif i.nodeName == 'link':
|
||||
if 'href' in i.attributes:
|
||||
m[i.nodeName] = i.attributes['href'].nodeValue
|
||||
elif len(i.childNodes) == 0:
|
||||
m[i.nodeName] = i.nodeValue
|
||||
elif len(i.childNodes) == 1 and i.firstChild.nodeName == '#cdata-section':
|
||||
p = HTMLSimpleParser()
|
||||
p.feed(i.firstChild.nodeValue)
|
||||
if p.data == '' and i.firstChild.nodeValue.find('<') == -1:
|
||||
m[i.nodeName] = i.firstChild.nodeValue
|
||||
else:
|
||||
m[i.nodeName] = p.data
|
||||
if i.nodeName == 'content':
|
||||
m['imgList'] = p.imgList
|
||||
m['videoList'] = p.videoList
|
||||
m['description'] = m['content']
|
||||
del m['content']
|
||||
else:
|
||||
m[i.nodeName] = ''
|
||||
for k in i.childNodes:
|
||||
m[i.nodeName] = m[i.nodeName] + k.toxml()
|
||||
return m
|
||||
|
||||
def check(self):
|
||||
try:
|
||||
checked = self.__checkasrss3()
|
||||
if not checked:
|
||||
checked = self.__checkasratom()
|
||||
return checked
|
||||
except:
|
||||
print(format_exc())
|
||||
return False
|
||||
|
||||
def normalize(self):
|
||||
self.removeblank(self.xmldoc.documentElement)
|
||||
|
||||
Reference in New Issue
Block a user