This commit is contained in:
2021-01-07 14:17:02 +08:00
parent bef75562ae
commit 7d0aa5abc2
6 changed files with 289 additions and 58 deletions

View File

@@ -15,6 +15,9 @@
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from json import loads
from config import RSSConfig
from time import time_ns
from typing import List
from hashl import sha256WithBase64
class ChatEntry:
@@ -28,8 +31,66 @@ class ChatEntry:
self.config = RSSConfig()
class HashEntry:
    """A single record made of an id, a hash and a timestamp.

    ``data`` is an indexable row read positionally as ``(id, hash, time)``.
    Explicit ``id`` / ``hash`` arguments, when not None, take precedence
    over the values found in the row.
    """

    def __init__(self, data=None, id: str = None, hash: str = None):
        # Read the three columns once; without a row every column is unset.
        if data is None:
            row_id = row_hash = row_time = None
        else:
            row_id, row_hash, row_time = data[0], data[1], data[2]

        self.id = row_id if id is None else id
        self.hash = row_hash if hash is None else hash
        # A record without a stored time is stamped with the current clock
        # value in nanoseconds.
        self.time = time_ns() if row_time is None else row_time
def calHash(url: str, item: dict) -> HashEntry:
    """Build the HashEntry that identifies one item of a feed.

    The entry ``id`` is the ``sha256WithBase64`` digest of ``url`` alone.
    The entry ``hash`` is the digest of ``url`` followed by the item's
    'title', 'link' and 'description' values, in that order, skipping any
    that are missing or None. No separator is inserted between the parts.

    Args:
        url: The feed URL string (it is hashed and concatenated as text).
        item: Parsed item mapping; only 'title', 'link' and 'description'
            are read.

    Returns:
        A HashEntry with ``id`` and ``hash`` set; its time is assigned by
        the HashEntry constructor.
    """
    feed_id = sha256WithBase64(url)
    parts = [url]
    for key in ('title', 'link', 'description'):
        value = item.get(key)
        if value is not None:
            parts.append(value)
    return HashEntry(id=feed_id, hash=sha256WithBase64(''.join(parts)))
class HashEntries:
    """Bounded, duplicate-free collection of hash entries.

    Entries are kept ordered by their ``time`` attribute (ascending). When
    the collection grows beyond the configured maximum, the entries with
    the smallest ``time`` are dropped first.
    """

    def __init__(self, maxCount: int = 100):
        """Create an empty collection.

        Args:
            maxCount: Maximum number of entries to keep. None or a value
                below 1 falls back to 100.
        """
        self.__list = []
        self.__maxCount = self.__normalizeMax(maxCount)

    @staticmethod
    def __normalizeMax(maxCount) -> int:
        """Return ``maxCount`` if it is a usable limit, otherwise 100."""
        return maxCount if maxCount is not None and maxCount >= 1 else 100

    def __removeMax(self):
        """Sort by time and drop the oldest entries beyond the limit."""
        self.__sort()
        excess = len(self.__list) - self.__maxCount
        if excess > 0:
            # After sorting, the oldest entries sit at the front; remove
            # them in one slice instead of one O(n) removal per entry.
            del self.__list[:excess]

    def __sort(self, reverse: bool = False):
        """Sort the stored entries in place by their ``time`` attribute."""
        self.__list.sort(key=lambda d: d.time, reverse=reverse)

    def add(self, d: "HashEntry"):
        """Store ``d`` unless it is incomplete or already present.

        An entry whose ``hash`` or ``id`` is None is ignored. An entry
        is a duplicate when both its ``hash`` and ``id`` equal those of a
        stored entry.
        """
        if d.hash is None or d.id is None:
            return
        if any(v.hash == d.hash and v.id == d.id for v in self.__list):
            return
        self.__list.append(d)
        self.__removeMax()

    def getList(self) -> List["HashEntry"]:
        """Return a new list of the stored entries, ordered by time."""
        self.__removeMax()
        # Hand out a copy so callers cannot mutate the internal list.
        return list(self.__list)

    def setMaxCount(self, maxCount: int):
        """Change the limit and trim the collection to it.

        None or a value below 1 falls back to 100, the same rule the
        constructor applies.
        """
        self.__maxCount = self.__normalizeMax(maxCount)
        self.__removeMax()
class RSSEntry:
def __init__(self, data=None):
def __init__(self, data=None, maxCount: int = 100):
self.title = None
if data is not None and data[0] is not None:
self.title = data[0]
@@ -46,3 +107,4 @@ class RSSEntry:
if data is not None and data[4] is not None:
self.id = data[4]
self.chatList = []
self.hashList = HashEntries(maxCount)

View File

@@ -21,10 +21,12 @@ class RSSConfig:
self.disable_web_page_preview = False
self.show_RSS_title = True
self.show_Content_title = True
self.show_content = True
self.send_media = True
if d is not None:
for k in d.keys():
if hasattr(self, k):
setattr(self, k, d[k])
def toJson(self):
return dumps({'disable_web_page_preview': self.disable_web_page_preview, 'show_RSS_title': self.show_RSS_title, 'show_Content_title': self.show_Content_title}, ensure_ascii=False)
return dumps({'disable_web_page_preview': self.disable_web_page_preview, 'show_RSS_title': self.show_RSS_title, 'show_Content_title': self.show_Content_title, 'show_content': self.show_content, 'send_media': self.send_media}, ensure_ascii=False)

View File

@@ -15,7 +15,7 @@
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import sqlite3
from config import RSSConfig
from RSSEntry import RSSEntry, ChatEntry
from RSSEntry import RSSEntry, ChatEntry, HashEntry, HashEntries
from typing import List
from enum import Enum, unique
from threading import Lock
@@ -84,13 +84,15 @@ PRIMARY KEY (hash)
)''')
self._db.commit()
def __init__(self):
def __init__(self, m):
self._version = [1, 0, 0, 0]
self._value_lock = Lock()
self._db = sqlite3.connect('data.db', check_same_thread=False)
ok = self.__check_database()
if not ok:
self.__create_table()
from rssbot import main
self._main: main = m
def __removeRSSEntry(self, id: str) -> bool:
try:
@@ -106,7 +108,7 @@ PRIMARY KEY (hash)
f'INSERT INTO config VALUES ({self._version[0]}, {self._version[1]}, {self._version[2]}, {self._version[3]});')
self._db.commit()
def addRSSList(self, title: str, url: str, chatId: int, config: RSSConfig, ttl: int = None):
def addRSSList(self, title: str, url: str, chatId: int, config: RSSConfig, ttl: int = None, hashEntries: HashEntries = None):
with self._value_lock:
try:
hashd = sha256WithBase64(url)
@@ -115,20 +117,37 @@ PRIMARY KEY (hash)
has_data = False
for i in cur:
has_data = True
break
if has_data:
self._db.execute(f'DELETE FROM RSSList WHERE id="{hashd}"')
self._db.execute(
f"INSERT INTO RSSList VALUES ('{dealtext(title)}', '{dealtext(url)}', {ttl if ttl is not None else 'null'}, null, '{hashd}')")
self._db.execute(
f"UPDATE RSSList SET title='{dealtext(title)}', ttl={ttl if ttl is not None else 'null'} WHERE id='{hashd}'")
else:
self._db.execute(
f"INSERT INTO RSSList VALUES ('{dealtext(title)}', '{dealtext(url)}', {ttl if ttl is not None else 'null'}, null, '{hashd}')")
cur = self._db.execute(
f'SELECT * FROM chatList WHERE id="{hashd}" AND chatId={chatId}')
has_data = False
has_data2 = False
for i in cur:
has_data = True
if has_data:
has_data2 = True
break
if has_data2:
self._db.execute(
f'DELETE FROM chatList WHERE id="{hashd}" AND chatId={chatId}')
self._db.execute(
f"INSERT INTO chatList VALUES ({chatId}, '{hashd}', '{dealtext(config.toJson())}')")
if hashEntries is not None and not has_data:
cur = self._db.execute(
f"SELECT * FROM hashList WHERE id='{hashd}'")
has_data3 = False
for i in cur:
has_data3 = True
break
if has_data3:
self._db.execute(
f"DELETE FROM hashList WHERE ID='{hashd}'")
for v in hashEntries.getList():
self._db.execute(
f"INSERT INTO hashList VALUES ('{v.id}', '{v.hash}', {v.time})")
self._db.commit()
return True
except:
@@ -139,12 +158,16 @@ PRIMARY KEY (hash)
cur = self._db.execute(f'SELECT * FROM RSSList;')
r = []
for i in cur:
temp = RSSEntry(i)
temp = RSSEntry(i, self._main._setting._maxCount)
cur2 = self._db.execute(
f'SELECT * FROM chatList WHERE id="{temp.id}"')
for i2 in cur2:
temp2 = ChatEntry(i2)
temp.chatList.append(temp2)
cur3 = self._db.execute(
f"SELECT * FROM hashList WHERE id='{temp.id}' ORDER BY time")
for i3 in cur3:
temp.hashList.add(HashEntry(i3))
if len(temp.chatList) == 0:
self.__removeRSSEntry(temp.id)
else:

View File

@@ -27,3 +27,4 @@ class settings:
if len(l) == 2:
d[l[0]] = l[1]
self._token = d['token'] if 'token' in d else None
self._maxCount = int(d['maxCount']) if 'maxCount' in d and d['maxCount'].isnumeric() else 100

View File

@@ -14,6 +14,7 @@
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from database import database, userStatus, RSSConfig
from RSSEntry import HashEntry, HashEntries, calHash
from os.path import exists
from readset import settings
from requests import Session
@@ -46,10 +47,14 @@ def getMediaInfo(m: dict, config: RSSConfig = RSSConfig()) -> str:
s = f"""{s}\n群/频道ID:{m['chatId']}"""
elif 'userId' in m and m['userId'] is not None:
s = f"""{s}\n<a href="tg://user?id={m['userId']}">订阅的账号</a>"""
if '_type' in m and m['_type'] is not None:
s = f"""{s}\n类型:{m['_type']}"""
s = f"{s}\n设置:"
s = f"{s}\n禁用预览:{config.disable_web_page_preview}"
s = f"{s}\n显示RSS标题:{config.show_RSS_title}"
s = f"{s}\n显示内容标题:{config.show_Content_title}"
s = f"{s}\n显示内容:{config.show_content}"
s = f"{s}\n发送媒体:{config.send_media}"
return s
@@ -63,6 +68,8 @@ class InlineKeyBoardCallBack(Enum):
DisableWebPagePreview = 6
ShowRSSTitle = 7
ShowContentTitle = 8
ShowContent = 9
SendMedia = 10
def getInlineKeyBoardWhenRSS(hashd: str, m: dict) -> str:
@@ -108,6 +115,14 @@ def getInlineKeyBoardWhenRSS2(hashd: str, config: RSSConfig) -> str:
temp = '隐藏内容标题' if config.show_Content_title else '显示内容标题'
d[i].append(
{'text': temp, 'callback_data': f'0,{hashd},{InlineKeyBoardCallBack.ShowContentTitle.value}'})
temp = '隐藏内容' if config.show_content else '显示内容'
d[i].append(
{'text': temp, 'callback_data': f'0,{hashd},{InlineKeyBoardCallBack.ShowContent.value}'})
d.append([])
i = i + 1
temp = '禁用发送媒体' if config.send_media else '启用发送媒体'
d[i].append(
{'text': temp, 'callback_data': f'0,{hashd},{InlineKeyBoardCallBack.SendMedia.value}'})
d[i].append(
{'text': '返回', 'callback_data': f'0,{hashd},{InlineKeyBoardCallBack.BackToNormalPage.value}'})
return {'inline_keyboard': d}
@@ -148,14 +163,14 @@ class main:
elif 'link' in content and content['link'] is not None and content['link'] != '':
text.addtotext(
f"""<a href="{content['link']}">{escape(content['link'])}</a>""")
if 'description' in content and content['description'] is not None and content['description'] != '':
if config.show_content and 'description' in content and content['description'] is not None and content['description'] != '':
text.addtotext(content['description'])
def getListCount(content: dict, key: str):
if key not in content and content[key] is None:
if key not in content or content[key] is None:
return 0
return len(content[key])
if getListCount(content, 'imgList') == 0 and getListCount(content, 'videoList') == 0:
if not config.send_media or (getListCount(content, 'imgList') == 0 and getListCount(content, 'videoList') == 0):
if config.disable_web_page_preview:
di['disable_web_page_preview'] = True
di['text'] = text.tostr()
@@ -218,14 +233,14 @@ class main:
self._upi = i['update_id'] + 1
def start(self):
self._db = database()
if not exists('settings.txt'):
print('找不到settings.txt')
return -1
self._setting = settings('settings.txt')
if self._setting._token is None:
print('没有机器人token')
return -1
self._db = database(self)
if not exists('settings.txt'):
print('找不到settings.txt')
return -1
self._r = Session()
self._me = self._request('getMe')
self._rssMetaList = rssMetaList()
@@ -390,9 +405,9 @@ class messageHandle(Thread):
continue
if chatMember['status'] not in ['creator', 'administrator']:
continue
if re2['type'] == 'channel' and ('can_post_messages' not in chatMember or not chatMember['can_post_messages']):
if re2['type'] == 'channel' and chatMember['status'] == 'administrator' and ('can_post_messages' not in chatMember or not chatMember['can_post_messages']):
continue
if re2['type'] == 'channel' and ('can_edit_messages' not in chatMember or not chatMember['can_edit_messages']):
if re2['type'] == 'channel' and chatMember['status'] == 'administrator' and ('can_edit_messages' not in chatMember or not chatMember['can_edit_messages']):
continue
chatM = chatMember
if chatM is None:
@@ -564,7 +579,13 @@ class callbackQueryHandle(Thread):
return
config = self._rssMeta.config
ttl = self._rssMeta.meta['ttl'] if 'ttl' in self._rssMeta.meta else None
suc = self._main._db.addRSSList(title, url, chatId, config, ttl)
hashEntries = HashEntries(self._main._setting._maxCount)
tempList = self._rssMeta.itemList.copy()
tempList.reverse()
for v in tempList[-100:]:
hashEntries.add(calHash(url, v))
suc = self._main._db.addRSSList(
title, url, chatId, config, ttl, hashEntries)
if suc:
self.answer('订阅成功!')
else:
@@ -677,6 +698,32 @@ class callbackQueryHandle(Thread):
self._main._request("editMessageText", "post", json=di)
self.answer()
return
elif self._inlineKeyBoardCommand == InlineKeyBoardCallBack.ShowContent:
self._rssMeta.config.show_content = not self._rssMeta.config.show_content
di = {'chat_id': self._rssMeta.chatId,
'message_id': self._rssMeta.messageId}
di['text'] = getMediaInfo(
self._rssMeta.meta, self._rssMeta.config)
di['parse_mode'] = 'HTML'
di['disable_web_page_preview'] = True
di['reply_markup'] = getInlineKeyBoardWhenRSS2(
self._hashd, self._rssMeta.config)
self._main._request("editMessageText", "post", json=di)
self.answer()
return
elif self._inlineKeyBoardCommand == InlineKeyBoardCallBack.SendMedia:
self._rssMeta.config.send_media = not self._rssMeta.config.send_media
di = {'chat_id': self._rssMeta.chatId,
'message_id': self._rssMeta.messageId}
di['text'] = getMediaInfo(
self._rssMeta.meta, self._rssMeta.config)
di['parse_mode'] = 'HTML'
di['disable_web_page_preview'] = True
di['reply_markup'] = getInlineKeyBoardWhenRSS2(
self._hashd, self._rssMeta.config)
self._main._request("editMessageText", "post", json=di)
self.answer()
return
else:
self.answer('未知的按钮。')
return

View File

@@ -15,20 +15,24 @@
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from xml.dom import minidom
from html.parser import HTMLParser
from html import escape
from html import escape, unescape
import sys
import requests
from traceback import format_exc
from urllib.parse import urljoin
class HTMLSimpleParser(HTMLParser):
def __init__(self):
def __init__(self, baseUrl: str=None):
self.data = ''
self.istag = False
self.tagContent = ''
self.tagAttrs = ''
self.imgList = []
self.videoList = []
self.baseUrl = ''
if baseUrl is not None:
self.baseUrl = baseUrl
HTMLParser.__init__(self)
def handle_startendtag(self, tag, attrs):
@@ -42,16 +46,16 @@ class HTMLSimpleParser(HTMLParser):
elif tag == 'img':
for key, value in attrs:
if key == 'src':
self.imgList.append(value)
self.imgList.append(urljoin(self.baseUrl, value))
break
return
elif tag == 'video':
p = {}
for key, value in attrs:
if key == 'src':
p['src'] = value
p['src'] = urljoin(self.baseUrl, value)
if key == 'poster':
p['poster'] = value
p['poster'] = urljoin(self.baseUrl, value)
if 'src' in p:
self.videoList.append(p)
return
@@ -61,18 +65,20 @@ class HTMLSimpleParser(HTMLParser):
if tag == 'a':
for key, value in attrs:
if key == 'href':
self.tagAttrs = f'{self.tagAttrs} href="{value}"'
self.tagAttrs = f'{self.tagAttrs} href="{urljoin(self.baseUrl, value)}"'
def handle_data(self, data):
if self.istag:
self.tagContent = self.tagContent + data
else:
self.data = self.data + data
self.data = self.data + escape(data)
def handle_endtag(self, tag):
self.istag = False
if tag in ['a', 'b', 'i', 'u', 's', 'strong', 'em', 'ins', 'strike', 'del', 'code', 'pre']:
self.data = f"{self.data}<{tag}{self.tagAttrs}>{self.tagContent}</{tag}>"
self.data = f"{self.data}<{tag}{self.tagAttrs}>{escape(self.tagContent)}</{tag}>"
elif tag not in ['img', 'video', 'br']:
self.data = f"{self.data}{escape(self.tagContent)}"
self.tagAttrs = ''
@@ -90,13 +96,19 @@ class RSSParser:
if i.nodeName == 'entry':
itemList.append(self.__dealItemAtom(i))
elif i.nodeName == 'link':
if 'href' in i.attributes:
typ = 'text/html'
if 'type' in i.attributes:
typ = i.attributes['type'].nodeValue
if 'href' in i.attributes and typ == 'text/html':
m[i.nodeName] = i.attributes['href'].nodeValue
elif i.nodeName == 'author':
if len(i.childNodes) == 1 and i.firstChild.nodeName == 'name':
name = i.firstChild
if len(name.childNodes) == 1 and name.firstChild.nodeName == '#cdata-section':
m['author'] = name.firstChild.nodeValue
for k in i.childNodes:
if k.nodeName == 'name':
m['author'] = k.nodeValue
break
elif len(k.childNodes) == 1 and k.firstChild.nodeName == '#cdata-section':
m['author'] = k.firstChild.nodeValue
break
else:
if len(i.childNodes) == 0:
m[i.nodeName] = i.nodeValue
@@ -115,7 +127,7 @@ class RSSParser:
self._type = 'atom'
return True
def __checkasrss3(self):
def __checkasrss2(self):
self._root = self.xmldoc.documentElement
if self._root.localName != 'rss' or len(self._root.childNodes) != 1:
return False
@@ -147,16 +159,29 @@ class RSSParser:
if 'ttl' in m and m['ttl'] is not None and m['ttl'].isnumeric():
self.ttl = int(m['ttl'])
self.itemList = itemList
self._type = 'rss3.0'
self._type = 'rss2.0'
return True
def __dealItem(self, node):
m = {}
for i in node.childNodes:
if len(i.childNodes) == 0:
if i.nodeName == 'link':
if len(i.childNodes) == 0:
m[i.nodeName] = i.nodeValue
else:
m[i.nodeName] = ''
for k in i.childNodes:
m[i.nodeName] = m[i.nodeName] + k.toxml()
break
for i in node.childNodes:
if i.nodeName == 'link':
continue
elif len(i.childNodes) == 0:
m[i.nodeName] = i.nodeValue
elif len(i.childNodes) == 1 and i.firstChild.nodeName == '#cdata-section':
p = HTMLSimpleParser()
if 'link' in m and m['link'] is not None:
p.baseUrl = m['link']
p.feed(i.firstChild.nodeValue)
if p.data == '' and i.firstChild.nodeValue.find('<') == -1:
m[i.nodeName] = i.firstChild.nodeValue
@@ -174,28 +199,95 @@ class RSSParser:
def __dealItemAtom(self, node):
m = {}
for i in node.childNodes:
if i.nodeName == 'author':
if len(i.childNodes) == 1 and i.firstChild.nodeName == 'name':
name = i.firstChild
if len(name.childNodes) == 1 and name.firstChild.nodeName == '#cdata-section':
m['author'] = name.firstChild.nodeValue
elif i.nodeName == 'link':
if i.nodeName == 'link':
if 'href' in i.attributes:
m[i.nodeName] = i.attributes['href'].nodeValue
for i in node.childNodes:
if i.nodeName == 'author':
for k in i.childNodes:
if k.nodeName == 'name':
if k.nodeValue is not None:
m['author'] = k.nodeValue
break
elif len(k.childNodes) == 1 and k.firstChild.nodeName == '#cdata-section':
m['author'] = k.firstChild.nodeValue
break
elif i.nodeName == 'link':
continue
elif i.nodeName in ['title', 'content', 'summary']:
typ = 'text'
if 'type' in i.attributes:
if i.attributes['type'].nodeValue in ['text', 'html', 'xhtml']:
typ = i.attributes['type'].nodeValue
if len(i.childNodes) == 1 and i.firstChild.nodeName == '#cdata-section':
p = HTMLSimpleParser()
if 'link' in m and m['link'] is not None:
p.baseUrl = m['link']
p.feed(i.firstChild.nodeValue)
if p.data == '' and i.firstChild.nodeValue.find('<') == -1:
m[i.nodeName] = i.firstChild.nodeValue
else:
m[i.nodeName] = p.data
if i.nodeName in ['content', 'summary']:
m['imgList'] = p.imgList
m['videoList'] = p.videoList
m['description'] = m[i.nodeName]
del m[i.nodeName]
elif i.nodeValue is None and len(i.childNodes) == 0:
continue
elif typ == 'text':
s = ''
if i.nodeValue is not None:
s = i.nodeValue
else:
for k in i.childNodes:
s = s + k.toxml()
m[i.nodeName] = unescape(s)
elif typ == 'html':
s = ''
if i.nodeValue is not None:
s = i.nodeValue
else:
for k in i.childNodes:
s = s + k.toxml()
p = HTMLSimpleParser()
if 'link' in m and m['link'] is not None:
p.baseUrl = m['link']
p.feed(unescape(s))
if p.data == '' and i.firstChild.nodeValue.find('<') == -1:
m[i.nodeName] = i.firstChild.nodeValue
else:
m[i.nodeName] = p.data
if i.nodeName in ['content', 'summary']:
m['imgList'] = p.imgList
m['videoList'] = p.videoList
m['description'] = m[i.nodeName]
del m[i.nodeName]
elif typ == 'xhtml':
p = HTMLSimpleParser()
if 'link' in m and m['link'] is not None:
p.baseUrl = m['link']
p.feed(i.firstChild.toxml())
if p.data == '' and i.firstChild.nodeValue.find('<') == -1:
m[i.nodeName] = i.firstChild.nodeValue
else:
m[i.nodeName] = p.data
if i.nodeName in ['content', 'summary']:
m['imgList'] = p.imgList
m['videoList'] = p.videoList
m['description'] = m[i.nodeName]
del m[i.nodeName]
elif len(i.childNodes) == 0:
m[i.nodeName] = i.nodeValue
elif len(i.childNodes) == 1 and i.firstChild.nodeName == '#cdata-section':
p = HTMLSimpleParser()
if 'link' in m and m['link'] is not None:
p.baseUrl = m['link']
p.feed(i.firstChild.nodeValue)
if p.data == '' and i.firstChild.nodeValue.find('<') == -1:
m[i.nodeName] = i.firstChild.nodeValue
else:
m[i.nodeName] = p.data
if i.nodeName == 'content':
m['imgList'] = p.imgList
m['videoList'] = p.videoList
m['description'] = m['content']
del m['content']
else:
m[i.nodeName] = ''
for k in i.childNodes:
@@ -203,14 +295,15 @@ class RSSParser:
return m
def check(self):
try:
checked = self.__checkasrss3()
if not checked:
checked = self.__checkasratom()
return checked
except:
print(format_exc())
return False
for f in [self.__checkasrss2, self.__checkasratom]:
try:
if f():
self.m['_type'] = self._type
return True
except:
print(format_exc())
pass
return False
def normalize(self):
self.removeblank(self.xmldoc.documentElement)
@@ -243,4 +336,7 @@ if __name__ == "__main__":
fn = sys.argv[1]
p = RSSParser()
p.parse(fn)
p.check()
if p.check():
print(p._type)
else:
print('解析失败')