update
This commit is contained in:
64
RSSEntry.py
64
RSSEntry.py
@@ -15,6 +15,9 @@
|
||||
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
from json import loads
|
||||
from config import RSSConfig
|
||||
from time import time_ns
|
||||
from typing import List
|
||||
from hashl import sha256WithBase64
|
||||
|
||||
|
||||
class ChatEntry:
|
||||
@@ -28,8 +31,66 @@ class ChatEntry:
|
||||
self.config = RSSConfig()
|
||||
|
||||
|
||||
class HashEntry:
    """One remembered feed item: a feed id, an item hash and a timestamp.

    Attributes:
        id: Identifier the entry belongs to (``calHash`` passes the hash of
            the feed URL here), or None when unknown.
        hash: Hash of the item content, or None when unknown.
        time: Timestamp used for ordering; taken from ``data[2]`` when
            present, otherwise the current ``time_ns()`` value.
    """

    def __init__(self, data=None, id: str = None, hash: str = None):
        """Build an entry from a row and/or explicit values.

        Args:
            data: Optional indexable row laid out as ``(id, hash, time)``.
                Missing or None columns are treated as "not provided".
            id: Explicit id; overrides ``data[0]`` when not None.
            hash: Explicit hash; overrides ``data[1]`` when not None.
        """
        # Pad the row to three columns so a short row behaves exactly like a
        # row whose trailing columns are None instead of raising IndexError.
        row = tuple(data) if data is not None else ()
        row = row + (None,) * (3 - len(row))
        self.id = id if id is not None else row[0]
        self.hash = hash if hash is not None else row[1]
        # Only fall back to "now" when the row carries no timestamp.
        self.time = row[2] if row[2] is not None else time_ns()
|
||||
|
||||
|
||||
def calHash(url: str, item: dict) -> "HashEntry":
    """Compute the hash entry identifying one item of a feed.

    Args:
        url: Feed URL. It is hashed on its own to form the entry id and is
            also the first component of the content hash.
        item: Parsed feed item. The values under ``'title'``, ``'link'`` and
            ``'description'`` are used when present and not None.

    Returns:
        A ``HashEntry`` whose ``id`` is the hash of ``url`` and whose ``hash``
        is the hash of ``url`` followed by the available title, link and
        description, concatenated in that order without separators.
    """
    feedId = sha256WithBase64(url)
    # Field order and the absence of a separator are part of the hash input;
    # changing either would change the hash computed for the same item.
    parts = [url]
    for key in ('title', 'link', 'description'):
        value = item.get(key)
        if value is not None:
            parts.append(value)
    return HashEntry(id=feedId, hash=sha256WithBase64(''.join(parts)))
|
||||
|
||||
|
||||
class HashEntries:
    """Bounded, de-duplicated collection of hash entries.

    Entries are kept sorted by their ``time`` attribute (ascending). Whenever
    the collection holds more than the configured maximum, the entries with
    the smallest ``time`` are dropped first.
    """

    def __init__(self, maxCount: int = 100):
        """Create an empty collection.

        Args:
            maxCount: Maximum number of entries to keep. ``None`` or any
                value below 1 falls back to 100.
        """
        self.__list = []
        self.__maxCount = self.__checkMaxCount(maxCount)

    @staticmethod
    def __checkMaxCount(maxCount) -> int:
        """Return ``maxCount`` when it is a usable limit (>= 1), else 100."""
        return maxCount if maxCount is not None and maxCount >= 1 else 100

    def __removeMax(self):
        """Sort by time and drop the oldest entries beyond the limit."""
        self.__sort()
        excess = len(self.__list) - self.__maxCount
        if excess > 0:
            # The list is sorted ascending, so the oldest entries are first.
            del self.__list[:excess]

    def __sort(self, reverse: bool = False):
        """Sort the entries in place by ``time``."""
        self.__list.sort(key=lambda d: d.time, reverse=reverse)

    def add(self, d: "HashEntry"):
        """Add an entry unless it is incomplete or already present.

        Entries whose ``id`` or ``hash`` is None are ignored, as are entries
        whose ``(id, hash)`` pair matches one that is already stored.

        Args:
            d: Entry to add; must expose ``id``, ``hash`` and ``time``.
        """
        if d.hash is None or d.id is None:
            return
        if any(v.hash == d.hash and v.id == d.id for v in self.__list):
            return
        self.__list.append(d)
        self.__removeMax()

    def getList(self) -> "List[HashEntry]":
        """Return a new list with the stored entries, oldest first.

        The returned list is a shallow copy: changing it does not change the
        collection, but the entries themselves are shared.
        """
        self.__removeMax()
        return list(self.__list)

    def setMaxCount(self, maxCount: int):
        """Change the limit and immediately drop entries beyond it.

        Args:
            maxCount: New maximum number of entries. ``None`` or any value
                below 1 falls back to 100, matching the constructor.
        """
        self.__maxCount = self.__checkMaxCount(maxCount)
        self.__removeMax()
|
||||
|
||||
|
||||
class RSSEntry:
|
||||
def __init__(self, data=None):
|
||||
def __init__(self, data=None, maxCount: int = 100):
|
||||
self.title = None
|
||||
if data is not None and data[0] is not None:
|
||||
self.title = data[0]
|
||||
@@ -46,3 +107,4 @@ class RSSEntry:
|
||||
if data is not None and data[4] is not None:
|
||||
self.id = data[4]
|
||||
self.chatList = []
|
||||
self.hashList = HashEntries(maxCount)
|
||||
|
||||
@@ -21,10 +21,12 @@ class RSSConfig:
|
||||
self.disable_web_page_preview = False
|
||||
self.show_RSS_title = True
|
||||
self.show_Content_title = True
|
||||
self.show_content = True
|
||||
self.send_media = True
|
||||
if d is not None:
|
||||
for k in d.keys():
|
||||
if hasattr(self, k):
|
||||
setattr(self, k, d[k])
|
||||
|
||||
def toJson(self):
|
||||
return dumps({'disable_web_page_preview': self.disable_web_page_preview, 'show_RSS_title': self.show_RSS_title, 'show_Content_title': self.show_Content_title}, ensure_ascii=False)
|
||||
return dumps({'disable_web_page_preview': self.disable_web_page_preview, 'show_RSS_title': self.show_RSS_title, 'show_Content_title': self.show_Content_title, 'show_content': self.show_content, 'send_media': self.send_media}, ensure_ascii=False)
|
||||
|
||||
43
database.py
43
database.py
@@ -15,7 +15,7 @@
|
||||
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
import sqlite3
|
||||
from config import RSSConfig
|
||||
from RSSEntry import RSSEntry, ChatEntry
|
||||
from RSSEntry import RSSEntry, ChatEntry, HashEntry, HashEntries
|
||||
from typing import List
|
||||
from enum import Enum, unique
|
||||
from threading import Lock
|
||||
@@ -84,13 +84,15 @@ PRIMARY KEY (hash)
|
||||
)''')
|
||||
self._db.commit()
|
||||
|
||||
def __init__(self):
|
||||
def __init__(self, m):
|
||||
self._version = [1, 0, 0, 0]
|
||||
self._value_lock = Lock()
|
||||
self._db = sqlite3.connect('data.db', check_same_thread=False)
|
||||
ok = self.__check_database()
|
||||
if not ok:
|
||||
self.__create_table()
|
||||
from rssbot import main
|
||||
self._main: main = m
|
||||
|
||||
def __removeRSSEntry(self, id: str) -> bool:
|
||||
try:
|
||||
@@ -106,7 +108,7 @@ PRIMARY KEY (hash)
|
||||
f'INSERT INTO config VALUES ({self._version[0]}, {self._version[1]}, {self._version[2]}, {self._version[3]});')
|
||||
self._db.commit()
|
||||
|
||||
def addRSSList(self, title: str, url: str, chatId: int, config: RSSConfig, ttl: int = None):
|
||||
def addRSSList(self, title: str, url: str, chatId: int, config: RSSConfig, ttl: int = None, hashEntries: HashEntries = None):
|
||||
with self._value_lock:
|
||||
try:
|
||||
hashd = sha256WithBase64(url)
|
||||
@@ -115,20 +117,37 @@ PRIMARY KEY (hash)
|
||||
has_data = False
|
||||
for i in cur:
|
||||
has_data = True
|
||||
break
|
||||
if has_data:
|
||||
self._db.execute(f'DELETE FROM RSSList WHERE id="{hashd}"')
|
||||
self._db.execute(
|
||||
f"INSERT INTO RSSList VALUES ('{dealtext(title)}', '{dealtext(url)}', {ttl if ttl is not None else 'null'}, null, '{hashd}')")
|
||||
self._db.execute(
|
||||
f"UPDATE RSSList SET title='{dealtext(title)}', ttl={ttl if ttl is not None else 'null'} WHERE id='{hashd}'")
|
||||
else:
|
||||
self._db.execute(
|
||||
f"INSERT INTO RSSList VALUES ('{dealtext(title)}', '{dealtext(url)}', {ttl if ttl is not None else 'null'}, null, '{hashd}')")
|
||||
cur = self._db.execute(
|
||||
f'SELECT * FROM chatList WHERE id="{hashd}" AND chatId={chatId}')
|
||||
has_data = False
|
||||
has_data2 = False
|
||||
for i in cur:
|
||||
has_data = True
|
||||
if has_data:
|
||||
has_data2 = True
|
||||
break
|
||||
if has_data2:
|
||||
self._db.execute(
|
||||
f'DELETE FROM chatList WHERE id="{hashd}" AND chatId={chatId}')
|
||||
self._db.execute(
|
||||
f"INSERT INTO chatList VALUES ({chatId}, '{hashd}', '{dealtext(config.toJson())}')")
|
||||
if hashEntries is not None and not has_data:
|
||||
cur = self._db.execute(
|
||||
f"SELECT * FROM hashList WHERE id='{hashd}'")
|
||||
has_data3 = False
|
||||
for i in cur:
|
||||
has_data3 = True
|
||||
break
|
||||
if has_data3:
|
||||
self._db.execute(
|
||||
f"DELETE FROM hashList WHERE ID='{hashd}'")
|
||||
for v in hashEntries.getList():
|
||||
self._db.execute(
|
||||
f"INSERT INTO hashList VALUES ('{v.id}', '{v.hash}', {v.time})")
|
||||
self._db.commit()
|
||||
return True
|
||||
except:
|
||||
@@ -139,12 +158,16 @@ PRIMARY KEY (hash)
|
||||
cur = self._db.execute(f'SELECT * FROM RSSList;')
|
||||
r = []
|
||||
for i in cur:
|
||||
temp = RSSEntry(i)
|
||||
temp = RSSEntry(i, self._main._setting._maxCount)
|
||||
cur2 = self._db.execute(
|
||||
f'SELECT * FROM chatList WHERE id="{temp.id}"')
|
||||
for i2 in cur2:
|
||||
temp2 = ChatEntry(i2)
|
||||
temp.chatList.append(temp2)
|
||||
cur3 = self._db.execute(
|
||||
f"SELECT * FROM hashList WHERE id='{temp.id}' ORDER BY time")
|
||||
for i3 in cur3:
|
||||
temp.hashList.add(HashEntry(i3))
|
||||
if len(temp.chatList) == 0:
|
||||
self.__removeRSSEntry(temp.id)
|
||||
else:
|
||||
|
||||
@@ -27,3 +27,4 @@ class settings:
|
||||
if len(l) == 2:
|
||||
d[l[0]] = l[1]
|
||||
self._token = d['token'] if 'token' in d else None
|
||||
self._maxCount = int(d['maxCount']) if 'maxCount' in d and d['maxCount'].isnumeric() else 100
|
||||
|
||||
67
rssbot.py
67
rssbot.py
@@ -14,6 +14,7 @@
|
||||
# You should have received a copy of the GNU Affero General Public License
|
||||
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
from database import database, userStatus, RSSConfig
|
||||
from RSSEntry import HashEntry, HashEntries, calHash
|
||||
from os.path import exists
|
||||
from readset import settings
|
||||
from requests import Session
|
||||
@@ -46,10 +47,14 @@ def getMediaInfo(m: dict, config: RSSConfig = RSSConfig()) -> str:
|
||||
s = f"""{s}\n群/频道ID:{m['chatId']}"""
|
||||
elif 'userId' in m and m['userId'] is not None:
|
||||
s = f"""{s}\n<a href="tg://user?id={m['userId']}">订阅的账号</a>"""
|
||||
if '_type' in m and m['_type'] is not None:
|
||||
s = f"""{s}\n类型:{m['_type']}"""
|
||||
s = f"{s}\n设置:"
|
||||
s = f"{s}\n禁用预览:{config.disable_web_page_preview}"
|
||||
s = f"{s}\n显示RSS标题:{config.show_RSS_title}"
|
||||
s = f"{s}\n显示内容标题:{config.show_Content_title}"
|
||||
s = f"{s}\n显示内容:{config.show_content}"
|
||||
s = f"{s}\n发送媒体:{config.send_media}"
|
||||
return s
|
||||
|
||||
|
||||
@@ -63,6 +68,8 @@ class InlineKeyBoardCallBack(Enum):
|
||||
DisableWebPagePreview = 6
|
||||
ShowRSSTitle = 7
|
||||
ShowContentTitle = 8
|
||||
ShowContent = 9
|
||||
SendMedia = 10
|
||||
|
||||
|
||||
def getInlineKeyBoardWhenRSS(hashd: str, m: dict) -> str:
|
||||
@@ -108,6 +115,14 @@ def getInlineKeyBoardWhenRSS2(hashd: str, config: RSSConfig) -> str:
|
||||
temp = '隐藏内容标题' if config.show_Content_title else '显示内容标题'
|
||||
d[i].append(
|
||||
{'text': temp, 'callback_data': f'0,{hashd},{InlineKeyBoardCallBack.ShowContentTitle.value}'})
|
||||
temp = '隐藏内容' if config.show_content else '显示内容'
|
||||
d[i].append(
|
||||
{'text': temp, 'callback_data': f'0,{hashd},{InlineKeyBoardCallBack.ShowContent.value}'})
|
||||
d.append([])
|
||||
i = i + 1
|
||||
temp = '禁用发送媒体' if config.send_media else '启用发送媒体'
|
||||
d[i].append(
|
||||
{'text': temp, 'callback_data': f'0,{hashd},{InlineKeyBoardCallBack.SendMedia.value}'})
|
||||
d[i].append(
|
||||
{'text': '返回', 'callback_data': f'0,{hashd},{InlineKeyBoardCallBack.BackToNormalPage.value}'})
|
||||
return {'inline_keyboard': d}
|
||||
@@ -148,14 +163,14 @@ class main:
|
||||
elif 'link' in content and content['link'] is not None and content['link'] != '':
|
||||
text.addtotext(
|
||||
f"""<a href="{content['link']}">{escape(content['link'])}</a>""")
|
||||
if 'description' in content and content['description'] is not None and content['description'] != '':
|
||||
if config.show_content and 'description' in content and content['description'] is not None and content['description'] != '':
|
||||
text.addtotext(content['description'])
|
||||
|
||||
def getListCount(content: dict, key: str):
|
||||
if key not in content and content[key] is None:
|
||||
if key not in content or content[key] is None:
|
||||
return 0
|
||||
return len(content[key])
|
||||
if getListCount(content, 'imgList') == 0 and getListCount(content, 'videoList') == 0:
|
||||
if not config.send_media or (getListCount(content, 'imgList') == 0 and getListCount(content, 'videoList') == 0):
|
||||
if config.disable_web_page_preview:
|
||||
di['disable_web_page_preview'] = True
|
||||
di['text'] = text.tostr()
|
||||
@@ -218,14 +233,14 @@ class main:
|
||||
self._upi = i['update_id'] + 1
|
||||
|
||||
def start(self):
|
||||
self._db = database()
|
||||
if not exists('settings.txt'):
|
||||
print('找不到settings.txt')
|
||||
return -1
|
||||
self._setting = settings('settings.txt')
|
||||
if self._setting._token is None:
|
||||
print('没有机器人token')
|
||||
return -1
|
||||
self._db = database(self)
|
||||
if not exists('settings.txt'):
|
||||
print('找不到settings.txt')
|
||||
return -1
|
||||
self._r = Session()
|
||||
self._me = self._request('getMe')
|
||||
self._rssMetaList = rssMetaList()
|
||||
@@ -390,9 +405,9 @@ class messageHandle(Thread):
|
||||
continue
|
||||
if chatMember['status'] not in ['creator', 'administrator']:
|
||||
continue
|
||||
if re2['type'] == 'channel' and ('can_post_messages' not in chatMember or not chatMember['can_post_messages']):
|
||||
if re2['type'] == 'channel' and chatMember['status'] == 'administrator' and ('can_post_messages' not in chatMember or not chatMember['can_post_messages']):
|
||||
continue
|
||||
if re2['type'] == 'channel' and ('can_edit_messages' not in chatMember or not chatMember['can_edit_messages']):
|
||||
if re2['type'] == 'channel' and chatMember['status'] == 'administrator' and ('can_edit_messages' not in chatMember or not chatMember['can_edit_messages']):
|
||||
continue
|
||||
chatM = chatMember
|
||||
if chatM is None:
|
||||
@@ -564,7 +579,13 @@ class callbackQueryHandle(Thread):
|
||||
return
|
||||
config = self._rssMeta.config
|
||||
ttl = self._rssMeta.meta['ttl'] if 'ttl' in self._rssMeta.meta else None
|
||||
suc = self._main._db.addRSSList(title, url, chatId, config, ttl)
|
||||
hashEntries = HashEntries(self._main._setting._maxCount)
|
||||
tempList = self._rssMeta.itemList.copy()
|
||||
tempList.reverse()
|
||||
for v in tempList[-100:]:
|
||||
hashEntries.add(calHash(url, v))
|
||||
suc = self._main._db.addRSSList(
|
||||
title, url, chatId, config, ttl, hashEntries)
|
||||
if suc:
|
||||
self.answer('订阅成功!')
|
||||
else:
|
||||
@@ -677,6 +698,32 @@ class callbackQueryHandle(Thread):
|
||||
self._main._request("editMessageText", "post", json=di)
|
||||
self.answer()
|
||||
return
|
||||
elif self._inlineKeyBoardCommand == InlineKeyBoardCallBack.ShowContent:
|
||||
self._rssMeta.config.show_content = not self._rssMeta.config.show_content
|
||||
di = {'chat_id': self._rssMeta.chatId,
|
||||
'message_id': self._rssMeta.messageId}
|
||||
di['text'] = getMediaInfo(
|
||||
self._rssMeta.meta, self._rssMeta.config)
|
||||
di['parse_mode'] = 'HTML'
|
||||
di['disable_web_page_preview'] = True
|
||||
di['reply_markup'] = getInlineKeyBoardWhenRSS2(
|
||||
self._hashd, self._rssMeta.config)
|
||||
self._main._request("editMessageText", "post", json=di)
|
||||
self.answer()
|
||||
return
|
||||
elif self._inlineKeyBoardCommand == InlineKeyBoardCallBack.SendMedia:
|
||||
self._rssMeta.config.send_media = not self._rssMeta.config.send_media
|
||||
di = {'chat_id': self._rssMeta.chatId,
|
||||
'message_id': self._rssMeta.messageId}
|
||||
di['text'] = getMediaInfo(
|
||||
self._rssMeta.meta, self._rssMeta.config)
|
||||
di['parse_mode'] = 'HTML'
|
||||
di['disable_web_page_preview'] = True
|
||||
di['reply_markup'] = getInlineKeyBoardWhenRSS2(
|
||||
self._hashd, self._rssMeta.config)
|
||||
self._main._request("editMessageText", "post", json=di)
|
||||
self.answer()
|
||||
return
|
||||
else:
|
||||
self.answer('未知的按钮。')
|
||||
return
|
||||
|
||||
168
rssparser.py
168
rssparser.py
@@ -15,20 +15,24 @@
|
||||
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
from xml.dom import minidom
|
||||
from html.parser import HTMLParser
|
||||
from html import escape
|
||||
from html import escape, unescape
|
||||
import sys
|
||||
import requests
|
||||
from traceback import format_exc
|
||||
from urllib.parse import urljoin
|
||||
|
||||
|
||||
class HTMLSimpleParser(HTMLParser):
|
||||
def __init__(self):
|
||||
def __init__(self, baseUrl: str=None):
|
||||
self.data = ''
|
||||
self.istag = False
|
||||
self.tagContent = ''
|
||||
self.tagAttrs = ''
|
||||
self.imgList = []
|
||||
self.videoList = []
|
||||
self.baseUrl = ''
|
||||
if baseUrl is not None:
|
||||
self.baseUrl = baseUrl
|
||||
HTMLParser.__init__(self)
|
||||
|
||||
def handle_startendtag(self, tag, attrs):
|
||||
@@ -42,16 +46,16 @@ class HTMLSimpleParser(HTMLParser):
|
||||
elif tag == 'img':
|
||||
for key, value in attrs:
|
||||
if key == 'src':
|
||||
self.imgList.append(value)
|
||||
self.imgList.append(urljoin(self.baseUrl, value))
|
||||
break
|
||||
return
|
||||
elif tag == 'video':
|
||||
p = {}
|
||||
for key, value in attrs:
|
||||
if key == 'src':
|
||||
p['src'] = value
|
||||
p['src'] = urljoin(self.baseUrl, value)
|
||||
if key == 'poster':
|
||||
p['poster'] = value
|
||||
p['poster'] = urljoin(self.baseUrl, value)
|
||||
if 'src' in p:
|
||||
self.videoList.append(p)
|
||||
return
|
||||
@@ -61,18 +65,20 @@ class HTMLSimpleParser(HTMLParser):
|
||||
if tag == 'a':
|
||||
for key, value in attrs:
|
||||
if key == 'href':
|
||||
self.tagAttrs = f'{self.tagAttrs} href="{value}"'
|
||||
self.tagAttrs = f'{self.tagAttrs} href="{urljoin(self.baseUrl, value)}"'
|
||||
|
||||
def handle_data(self, data):
|
||||
if self.istag:
|
||||
self.tagContent = self.tagContent + data
|
||||
else:
|
||||
self.data = self.data + data
|
||||
self.data = self.data + escape(data)
|
||||
|
||||
def handle_endtag(self, tag):
|
||||
self.istag = False
|
||||
if tag in ['a', 'b', 'i', 'u', 's', 'strong', 'em', 'ins', 'strike', 'del', 'code', 'pre']:
|
||||
self.data = f"{self.data}<{tag}{self.tagAttrs}>{self.tagContent}</{tag}>"
|
||||
self.data = f"{self.data}<{tag}{self.tagAttrs}>{escape(self.tagContent)}</{tag}>"
|
||||
elif tag not in ['img', 'video', 'br']:
|
||||
self.data = f"{self.data}{escape(self.tagContent)}"
|
||||
self.tagAttrs = ''
|
||||
|
||||
|
||||
@@ -90,13 +96,19 @@ class RSSParser:
|
||||
if i.nodeName == 'entry':
|
||||
itemList.append(self.__dealItemAtom(i))
|
||||
elif i.nodeName == 'link':
|
||||
if 'href' in i.attributes:
|
||||
typ = 'text/html'
|
||||
if 'type' in i.attributes:
|
||||
typ = i.attributes['type'].nodeValue
|
||||
if 'href' in i.attributes and typ == 'text/html':
|
||||
m[i.nodeName] = i.attributes['href'].nodeValue
|
||||
elif i.nodeName == 'author':
|
||||
if len(i.childNodes) == 1 and i.firstChild.nodeName == 'name':
|
||||
name = i.firstChild
|
||||
if len(name.childNodes) == 1 and name.firstChild.nodeName == '#cdata-section':
|
||||
m['author'] = name.firstChild.nodeValue
|
||||
for k in i.childNodes:
|
||||
if k.nodeName == 'name':
|
||||
m['author'] = k.nodeValue
|
||||
break
|
||||
elif len(k.childNodes) == 1 and k.firstChild.nodeName == '#cdata-section':
|
||||
m['author'] = k.firstChild.nodeValue
|
||||
break
|
||||
else:
|
||||
if len(i.childNodes) == 0:
|
||||
m[i.nodeName] = i.nodeValue
|
||||
@@ -115,7 +127,7 @@ class RSSParser:
|
||||
self._type = 'atom'
|
||||
return True
|
||||
|
||||
def __checkasrss3(self):
|
||||
def __checkasrss2(self):
|
||||
self._root = self.xmldoc.documentElement
|
||||
if self._root.localName != 'rss' or len(self._root.childNodes) != 1:
|
||||
return False
|
||||
@@ -147,16 +159,29 @@ class RSSParser:
|
||||
if 'ttl' in m and m['ttl'] is not None and m['ttl'].isnumeric():
|
||||
self.ttl = int(m['ttl'])
|
||||
self.itemList = itemList
|
||||
self._type = 'rss3.0'
|
||||
self._type = 'rss2.0'
|
||||
return True
|
||||
|
||||
def __dealItem(self, node):
|
||||
m = {}
|
||||
for i in node.childNodes:
|
||||
if len(i.childNodes) == 0:
|
||||
if i.nodeName == 'link':
|
||||
if len(i.childNodes) == 0:
|
||||
m[i.nodeName] = i.nodeValue
|
||||
else:
|
||||
m[i.nodeName] = ''
|
||||
for k in i.childNodes:
|
||||
m[i.nodeName] = m[i.nodeName] + k.toxml()
|
||||
break
|
||||
for i in node.childNodes:
|
||||
if i.nodeName == 'link':
|
||||
continue
|
||||
elif len(i.childNodes) == 0:
|
||||
m[i.nodeName] = i.nodeValue
|
||||
elif len(i.childNodes) == 1 and i.firstChild.nodeName == '#cdata-section':
|
||||
p = HTMLSimpleParser()
|
||||
if 'link' in m and m['link'] is not None:
|
||||
p.baseUrl = m['link']
|
||||
p.feed(i.firstChild.nodeValue)
|
||||
if p.data == '' and i.firstChild.nodeValue.find('<') == -1:
|
||||
m[i.nodeName] = i.firstChild.nodeValue
|
||||
@@ -174,28 +199,95 @@ class RSSParser:
|
||||
def __dealItemAtom(self, node):
|
||||
m = {}
|
||||
for i in node.childNodes:
|
||||
if i.nodeName == 'author':
|
||||
if len(i.childNodes) == 1 and i.firstChild.nodeName == 'name':
|
||||
name = i.firstChild
|
||||
if len(name.childNodes) == 1 and name.firstChild.nodeName == '#cdata-section':
|
||||
m['author'] = name.firstChild.nodeValue
|
||||
elif i.nodeName == 'link':
|
||||
if i.nodeName == 'link':
|
||||
if 'href' in i.attributes:
|
||||
m[i.nodeName] = i.attributes['href'].nodeValue
|
||||
for i in node.childNodes:
|
||||
if i.nodeName == 'author':
|
||||
for k in i.childNodes:
|
||||
if k.nodeName == 'name':
|
||||
if k.nodeValue is not None:
|
||||
m['author'] = k.nodeValue
|
||||
break
|
||||
elif len(k.childNodes) == 1 and k.firstChild.nodeName == '#cdata-section':
|
||||
m['author'] = k.firstChild.nodeValue
|
||||
break
|
||||
elif i.nodeName == 'link':
|
||||
continue
|
||||
elif i.nodeName in ['title', 'content', 'summary']:
|
||||
typ = 'text'
|
||||
if 'type' in i.attributes:
|
||||
if i.attributes['type'].nodeValue in ['text', 'html', 'xhtml']:
|
||||
typ = i.attributes['type'].nodeValue
|
||||
if len(i.childNodes) == 1 and i.firstChild.nodeName == '#cdata-section':
|
||||
p = HTMLSimpleParser()
|
||||
if 'link' in m and m['link'] is not None:
|
||||
p.baseUrl = m['link']
|
||||
p.feed(i.firstChild.nodeValue)
|
||||
if p.data == '' and i.firstChild.nodeValue.find('<') == -1:
|
||||
m[i.nodeName] = i.firstChild.nodeValue
|
||||
else:
|
||||
m[i.nodeName] = p.data
|
||||
if i.nodeName in ['content', 'summary']:
|
||||
m['imgList'] = p.imgList
|
||||
m['videoList'] = p.videoList
|
||||
m['description'] = m[i.nodeName]
|
||||
del m[i.nodeName]
|
||||
elif i.nodeValue is None and len(i.childNodes) == 0:
|
||||
continue
|
||||
elif typ == 'text':
|
||||
s = ''
|
||||
if i.nodeValue is not None:
|
||||
s = i.nodeValue
|
||||
else:
|
||||
for k in i.childNodes:
|
||||
s = s + k.toxml()
|
||||
m[i.nodeName] = unescape(s)
|
||||
elif typ == 'html':
|
||||
s = ''
|
||||
if i.nodeValue is not None:
|
||||
s = i.nodeValue
|
||||
else:
|
||||
for k in i.childNodes:
|
||||
s = s + k.toxml()
|
||||
p = HTMLSimpleParser()
|
||||
if 'link' in m and m['link'] is not None:
|
||||
p.baseUrl = m['link']
|
||||
p.feed(unescape(s))
|
||||
if p.data == '' and i.firstChild.nodeValue.find('<') == -1:
|
||||
m[i.nodeName] = i.firstChild.nodeValue
|
||||
else:
|
||||
m[i.nodeName] = p.data
|
||||
if i.nodeName in ['content', 'summary']:
|
||||
m['imgList'] = p.imgList
|
||||
m['videoList'] = p.videoList
|
||||
m['description'] = m[i.nodeName]
|
||||
del m[i.nodeName]
|
||||
elif typ == 'xhtml':
|
||||
p = HTMLSimpleParser()
|
||||
if 'link' in m and m['link'] is not None:
|
||||
p.baseUrl = m['link']
|
||||
p.feed(i.firstChild.toxml())
|
||||
if p.data == '' and i.firstChild.nodeValue.find('<') == -1:
|
||||
m[i.nodeName] = i.firstChild.nodeValue
|
||||
else:
|
||||
m[i.nodeName] = p.data
|
||||
if i.nodeName in ['content', 'summary']:
|
||||
m['imgList'] = p.imgList
|
||||
m['videoList'] = p.videoList
|
||||
m['description'] = m[i.nodeName]
|
||||
del m[i.nodeName]
|
||||
elif len(i.childNodes) == 0:
|
||||
m[i.nodeName] = i.nodeValue
|
||||
elif len(i.childNodes) == 1 and i.firstChild.nodeName == '#cdata-section':
|
||||
p = HTMLSimpleParser()
|
||||
if 'link' in m and m['link'] is not None:
|
||||
p.baseUrl = m['link']
|
||||
p.feed(i.firstChild.nodeValue)
|
||||
if p.data == '' and i.firstChild.nodeValue.find('<') == -1:
|
||||
m[i.nodeName] = i.firstChild.nodeValue
|
||||
else:
|
||||
m[i.nodeName] = p.data
|
||||
if i.nodeName == 'content':
|
||||
m['imgList'] = p.imgList
|
||||
m['videoList'] = p.videoList
|
||||
m['description'] = m['content']
|
||||
del m['content']
|
||||
else:
|
||||
m[i.nodeName] = ''
|
||||
for k in i.childNodes:
|
||||
@@ -203,14 +295,15 @@ class RSSParser:
|
||||
return m
|
||||
|
||||
def check(self):
|
||||
try:
|
||||
checked = self.__checkasrss3()
|
||||
if not checked:
|
||||
checked = self.__checkasratom()
|
||||
return checked
|
||||
except:
|
||||
print(format_exc())
|
||||
return False
|
||||
for f in [self.__checkasrss2, self.__checkasratom]:
|
||||
try:
|
||||
if f():
|
||||
self.m['_type'] = self._type
|
||||
return True
|
||||
except:
|
||||
print(format_exc())
|
||||
pass
|
||||
return False
|
||||
|
||||
def normalize(self):
|
||||
self.removeblank(self.xmldoc.documentElement)
|
||||
@@ -243,4 +336,7 @@ if __name__ == "__main__":
|
||||
fn = sys.argv[1]
|
||||
p = RSSParser()
|
||||
p.parse(fn)
|
||||
p.check()
|
||||
if p.check():
|
||||
print(p._type)
|
||||
else:
|
||||
print('解析失败')
|
||||
|
||||
Reference in New Issue
Block a user