From 7d0aa5abc2d215ccc58cca328534c25000bf07d9 Mon Sep 17 00:00:00 2001 From: lifegpc Date: Thu, 7 Jan 2021 14:17:02 +0800 Subject: [PATCH] update --- RSSEntry.py | 64 +++++++++++++++++++- config.py | 4 +- database.py | 43 ++++++++++--- readset.py | 1 + rssbot.py | 67 +++++++++++++++++--- rssparser.py | 168 ++++++++++++++++++++++++++++++++++++++++----------- 6 files changed, 289 insertions(+), 58 deletions(-) diff --git a/RSSEntry.py b/RSSEntry.py index 91a4007..872af4c 100644 --- a/RSSEntry.py +++ b/RSSEntry.py @@ -15,6 +15,9 @@ # along with this program. If not, see . from json import loads from config import RSSConfig +from time import time_ns +from typing import List +from hashl import sha256WithBase64 class ChatEntry: @@ -28,8 +31,66 @@ class ChatEntry: self.config = RSSConfig() +class HashEntry: + def __init__(self, data=None, id: str = None, hash: str = None): + self.id = data[0] if data is not None and data[0] is not None else None + self.hash = data[1] if data is not None and data[1] is not None else None + self.time = data[2] if data is not None and data[2] is not None else time_ns() + if id is not None: + self.id = id + if hash is not None: + self.hash = hash + + +def calHash(url: dict, item: dict) -> HashEntry: + hashd = sha256WithBase64(url) + hasht = url + if 'title' in item and item['title'] is not None: + hasht = hasht + item['title'] + if 'link' in item and item['link'] is not None: + hasht = hasht + item['link'] + if 'description' in item and item['description'] is not None: + hasht = hasht + item['description'] + hashed = sha256WithBase64(hasht) + return HashEntry(id=hashd, hash=hashed) + + +class HashEntries: + def __init__(self, maxCount: int = 100): + self.__list = [] + self.__maxCount = maxCount if maxCount is not None and maxCount >= 1 else 100 + + def __removeMax(self): + self.__sort() + while len(self.__list) > self.__maxCount: + t = self.__list[0] + self.__list.remove(t) + + def __sort(self, reverse: bool = False): + self.__list.sort(key=lambda d: d.time, reverse=reverse) + + def add(self, d: HashEntry): + if d.hash is not None and d.id is not None: + for v in self.__list: + if v.hash == d.hash and v.id == d.id: + return + self.__list.append(d) + self.__removeMax() + + def getList(self) -> List[HashEntry]: + self.__removeMax() + r = [] + for i in self.__list: + r.append(i) + return r + + def setMaxCount(self, maxCount: int): + self.__maxCount = maxCount if maxCount >= 1 else 100 + self.__removeMax() + + class RSSEntry: - def __init__(self, data=None): + def __init__(self, data=None, maxCount: int = 100): self.title = None if data is not None and data[0] is not None: self.title = data[0] @@ -46,3 +107,4 @@ class RSSEntry: if data is not None and data[4] is not None: self.id = data[4] self.chatList = [] + self.hashList = HashEntries(maxCount) diff --git a/config.py b/config.py index 3b95a19..5ca7780 100644 --- a/config.py +++ b/config.py @@ -21,10 +21,12 @@ class RSSConfig: self.disable_web_page_preview = False self.show_RSS_title = True self.show_Content_title = True + self.show_content = True + self.send_media = True if d is not None: for k in d.keys(): if hasattr(self, k): setattr(self, k, d[k]) def toJson(self): - return dumps({'disable_web_page_preview': self.disable_web_page_preview, 'show_RSS_title': self.show_RSS_title, 'show_Content_title': self.show_Content_title}, ensure_ascii=False) + return dumps({'disable_web_page_preview': self.disable_web_page_preview, 'show_RSS_title': self.show_RSS_title, 'show_Content_title': self.show_Content_title, 'show_content': self.show_content, 'send_media': self.send_media}, ensure_ascii=False) diff --git a/database.py b/database.py index 20715af..28a3a77 100644 --- a/database.py +++ b/database.py @@ -15,7 +15,7 @@ # along with this program. If not, see . import sqlite3 from config import RSSConfig -from RSSEntry import RSSEntry, ChatEntry +from RSSEntry import RSSEntry, ChatEntry, HashEntry, HashEntries from typing import List from enum import Enum, unique from threading import Lock @@ -84,13 +84,15 @@ PRIMARY KEY (hash) )''') self._db.commit() - def __init__(self): + def __init__(self, m): self._version = [1, 0, 0, 0] self._value_lock = Lock() self._db = sqlite3.connect('data.db', check_same_thread=False) ok = self.__check_database() if not ok: self.__create_table() + from rssbot import main + self._main: main = m def __removeRSSEntry(self, id: str) -> bool: try: @@ -106,7 +108,7 @@ PRIMARY KEY (hash) f'INSERT INTO config VALUES ({self._version[0]}, {self._version[1]}, {self._version[2]}, {self._version[3]});') self._db.commit() - def addRSSList(self, title: str, url: str, chatId: int, config: RSSConfig, ttl: int = None): + def addRSSList(self, title: str, url: str, chatId: int, config: RSSConfig, ttl: int = None, hashEntries: HashEntries = None): with self._value_lock: try: hashd = sha256WithBase64(url) @@ -115,20 +117,37 @@ PRIMARY KEY (hash) has_data = False for i in cur: has_data = True + break if has_data: - self._db.execute(f'DELETE FROM RSSList WHERE id="{hashd}"') - self._db.execute( - f"INSERT INTO RSSList VALUES ('{dealtext(title)}', '{dealtext(url)}', {ttl if ttl is not None else 'null'}, null, '{hashd}')") + self._db.execute( + f"UPDATE RSSList SET title='{dealtext(title)}', ttl={ttl if ttl is not None else 'null'} WHERE id='{hashd}'") + else: + self._db.execute( + f"INSERT INTO RSSList VALUES ('{dealtext(title)}', '{dealtext(url)}', {ttl if ttl is not None else 'null'}, null, '{hashd}')") cur = self._db.execute( f'SELECT * FROM chatList WHERE id="{hashd}" AND chatId={chatId}') - has_data = False + has_data2 = False for i in cur: - has_data = True - if has_data: + has_data2 = True + break + if has_data2: self._db.execute( f'DELETE FROM chatList WHERE id="{hashd}" AND chatId={chatId}') self._db.execute( f"INSERT INTO chatList VALUES ({chatId}, '{hashd}', '{dealtext(config.toJson())}')") + if hashEntries is not None and not has_data: + cur = self._db.execute( + f"SELECT * FROM hashList WHERE id='{hashd}'") + has_data3 = False + for i in cur: + has_data3 = True + break + if has_data3: + self._db.execute( + f"DELETE FROM hashList WHERE ID='{hashd}'") + for v in hashEntries.getList(): + self._db.execute( + f"INSERT INTO hashList VALUES ('{v.id}', '{v.hash}', {v.time})") self._db.commit() return True except: @@ -139,12 +158,16 @@ PRIMARY KEY (hash) cur = self._db.execute(f'SELECT * FROM RSSList;') r = [] for i in cur: - temp = RSSEntry(i) + temp = RSSEntry(i, self._main._setting._maxCount) cur2 = self._db.execute( f'SELECT * FROM chatList WHERE id="{temp.id}"') for i2 in cur2: temp2 = ChatEntry(i2) temp.chatList.append(temp2) + cur3 = self._db.execute( + f"SELECT * FROM hashList WHERE id='{temp.id}' ORDER BY time") + for i3 in cur3: + temp.hashList.add(HashEntry(i3)) if len(temp.chatList) == 0: self.__removeRSSEntry(temp.id) else: diff --git a/readset.py b/readset.py index 72b6982..864feba 100644 --- a/readset.py +++ b/readset.py @@ -27,3 +27,4 @@ class settings: if len(l) == 2: d[l[0]] = l[1] self._token = d['token'] if 'token' in d else None + self._maxCount = int(d['maxCount']) if 'maxCount' in d and d['maxCount'].isnumeric() else 100 diff --git a/rssbot.py b/rssbot.py index 95cdf0c..072f6dc 100644 --- a/rssbot.py +++ b/rssbot.py @@ -14,6 +14,7 @@ # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . from database import database, userStatus, RSSConfig +from RSSEntry import HashEntry, HashEntries, calHash from os.path import exists from readset import settings from requests import Session @@ -46,10 +47,14 @@ def getMediaInfo(m: dict, config: RSSConfig = RSSConfig()) -> str: s = f"""{s}\n群/频道ID:{m['chatId']}""" elif 'userId' in m and m['userId'] is not None: s = f"""{s}\n订阅的账号""" + if '_type' in m and m['_type'] is not None: + s = f"""{s}\n类型:{m['_type']}""" s = f"{s}\n设置:" s = f"{s}\n禁用预览:{config.disable_web_page_preview}" s = f"{s}\n显示RSS标题:{config.show_RSS_title}" s = f"{s}\n显示内容标题:{config.show_Content_title}" + s = f"{s}\n显示内容:{config.show_content}" + s = f"{s}\n发送媒体:{config.send_media}" return s @@ -63,6 +68,8 @@ class InlineKeyBoardCallBack(Enum): DisableWebPagePreview = 6 ShowRSSTitle = 7 ShowContentTitle = 8 + ShowContent = 9 + SendMedia = 10 def getInlineKeyBoardWhenRSS(hashd: str, m: dict) -> str: @@ -108,6 +115,14 @@ def getInlineKeyBoardWhenRSS2(hashd: str, config: RSSConfig) -> str: temp = '隐藏内容标题' if config.show_Content_title else '显示内容标题' d[i].append( {'text': temp, 'callback_data': f'0,{hashd},{InlineKeyBoardCallBack.ShowContentTitle.value}'}) + temp = '隐藏内容' if config.show_content else '显示内容' + d[i].append( + {'text': temp, 'callback_data': f'0,{hashd},{InlineKeyBoardCallBack.ShowContent.value}'}) + d.append([]) + i = i + 1 + temp = '禁用发送媒体' if config.send_media else '启用发送媒体' + d[i].append( + {'text': temp, 'callback_data': f'0,{hashd},{InlineKeyBoardCallBack.SendMedia.value}'}) d[i].append( {'text': '返回', 'callback_data': f'0,{hashd},{InlineKeyBoardCallBack.BackToNormalPage.value}'}) return {'inline_keyboard': d} @@ -148,14 +163,14 @@ class main: elif 'link' in content and content['link'] is not None and content['link'] != '': text.addtotext( f"""{escape(content['link'])}""") - if 'description' in content and content['description'] is not None and content['description'] != '': + if config.show_content and 'description' in content and content['description'] is not None and content['description'] != '': text.addtotext(content['description']) def getListCount(content: dict, key: str): - if key not in content and content[key] is None: + if key not in content or content[key] is None: return 0 return len(content[key]) - if getListCount(content, 'imgList') == 0 and getListCount(content, 'videoList') == 0: + if not config.send_media or (getListCount(content, 'imgList') == 0 and getListCount(content, 'videoList') == 0): if config.disable_web_page_preview: di['disable_web_page_preview'] = True di['text'] = text.tostr() @@ -218,14 +233,14 @@ class main: self._upi = i['update_id'] + 1 def start(self): - self._db = database() - if not exists('settings.txt'): - print('找不到settings.txt') - return -1 self._setting = settings('settings.txt') if self._setting._token is None: print('没有机器人token') return -1 + self._db = database(self) + if not exists('settings.txt'): + print('找不到settings.txt') + return -1 self._r = Session() self._me = self._request('getMe') self._rssMetaList = rssMetaList() @@ -390,9 +405,9 @@ class messageHandle(Thread): continue if chatMember['status'] not in ['creator', 'administrator']: continue - if re2['type'] == 'channel' and ('can_post_messages' not in chatMember or not chatMember['can_post_messages']): + if re2['type'] == 'channel' and chatMember['status'] == 'administrator' and ('can_post_messages' not in chatMember or not chatMember['can_post_messages']): continue - if re2['type'] == 'channel' and ('can_edit_messages' not in chatMember or not chatMember['can_edit_messages']): + if re2['type'] == 'channel' and chatMember['status'] == 'administrator' and ('can_edit_messages' not in chatMember or not chatMember['can_edit_messages']): continue chatM = chatMember if chatM is None: @@ -564,7 +579,13 @@ class callbackQueryHandle(Thread): return config = self._rssMeta.config ttl = self._rssMeta.meta['ttl'] if 'ttl' in self._rssMeta.meta else None - suc = self._main._db.addRSSList(title, url, chatId, config, ttl) + hashEntries = HashEntries(self._main._setting._maxCount) + tempList = self._rssMeta.itemList.copy() + tempList.reverse() + for v in tempList[-100:]: + hashEntries.add(calHash(url, v)) + suc = self._main._db.addRSSList( + title, url, chatId, config, ttl, hashEntries) if suc: self.answer('订阅成功!') else: @@ -677,6 +698,32 @@ class callbackQueryHandle(Thread): self._main._request("editMessageText", "post", json=di) self.answer() return + elif self._inlineKeyBoardCommand == InlineKeyBoardCallBack.ShowContent: + self._rssMeta.config.show_content = not self._rssMeta.config.show_content + di = {'chat_id': self._rssMeta.chatId, + 'message_id': self._rssMeta.messageId} + di['text'] = getMediaInfo( + self._rssMeta.meta, self._rssMeta.config) + di['parse_mode'] = 'HTML' + di['disable_web_page_preview'] = True + di['reply_markup'] = getInlineKeyBoardWhenRSS2( + self._hashd, self._rssMeta.config) + self._main._request("editMessageText", "post", json=di) + self.answer() + return + elif self._inlineKeyBoardCommand == InlineKeyBoardCallBack.SendMedia: + self._rssMeta.config.send_media = not self._rssMeta.config.send_media + di = {'chat_id': self._rssMeta.chatId, + 'message_id': self._rssMeta.messageId} + di['text'] = getMediaInfo( + self._rssMeta.meta, self._rssMeta.config) + di['parse_mode'] = 'HTML' + di['disable_web_page_preview'] = True + di['reply_markup'] = getInlineKeyBoardWhenRSS2( + self._hashd, self._rssMeta.config) + self._main._request("editMessageText", "post", json=di) + self.answer() + return else: self.answer('未知的按钮。') return diff --git a/rssparser.py b/rssparser.py index 93da76f..a072bc5 100644 --- a/rssparser.py +++ b/rssparser.py @@ -15,20 +15,24 @@ # along with this program. If not, see . from xml.dom import minidom from html.parser import HTMLParser -from html import escape +from html import escape, unescape import sys import requests from traceback import format_exc +from urllib.parse import urljoin class HTMLSimpleParser(HTMLParser): - def __init__(self): + def __init__(self, baseUrl: str=None): self.data = '' self.istag = False self.tagContent = '' self.tagAttrs = '' self.imgList = [] self.videoList = [] + self.baseUrl = '' + if baseUrl is not None: + self.baseUrl = baseUrl HTMLParser.__init__(self) def handle_startendtag(self, tag, attrs): @@ -42,16 +46,16 @@ class HTMLSimpleParser(HTMLParser): elif tag == 'img': for key, value in attrs: if key == 'src': - self.imgList.append(value) + self.imgList.append(urljoin(self.baseUrl, value)) break return elif tag == 'video': p = {} for key, value in attrs: if key == 'src': - p['src'] = value + p['src'] = urljoin(self.baseUrl, value) if key == 'poster': - p['poster'] = value + p['poster'] = urljoin(self.baseUrl, value) if 'src' in p: self.videoList.append(p) return @@ -61,18 +65,20 @@ class HTMLSimpleParser(HTMLParser): if tag == 'a': for key, value in attrs: if key == 'href': - self.tagAttrs = f'{self.tagAttrs} href="{value}"' + self.tagAttrs = f'{self.tagAttrs} href="{urljoin(self.baseUrl, value)}"' def handle_data(self, data): if self.istag: self.tagContent = self.tagContent + data else: - self.data = self.data + data + self.data = self.data + escape(data) def handle_endtag(self, tag): self.istag = False if tag in ['a', 'b', 'i', 'u', 's', 'strong', 'em', 'ins', 'strike', 'del', 'code', 'pre']: - self.data = f"{self.data}<{tag}{self.tagAttrs}>{self.tagContent}" + self.data = f"{self.data}<{tag}{self.tagAttrs}>{escape(self.tagContent)}" + elif tag not in ['img', 'video', 'br']: + self.data = f"{self.data}{escape(self.tagContent)}" self.tagAttrs = '' @@ -90,13 +96,19 @@ class RSSParser: if i.nodeName == 'entry': itemList.append(self.__dealItemAtom(i)) elif i.nodeName == 'link': - if 'href' in i.attributes: + typ = 'text/html' + if 'type' in i.attributes: + typ = i.attributes['type'].nodeValue + if 'href' in i.attributes and typ == 'text/html': m[i.nodeName] = i.attributes['href'].nodeValue elif i.nodeName == 'author': - if len(i.childNodes) == 1 and i.firstChild.nodeName == 'name': - name = i.firstChild - if len(name.childNodes) == 1 and name.firstChild.nodeName == '#cdata-section': - m['author'] = name.firstChild.nodeValue + for k in i.childNodes: + if k.nodeName == 'name': + m['author'] = k.nodeValue + break + elif len(k.childNodes) == 1 and k.firstChild.nodeName == '#cdata-section': + m['author'] = k.firstChild.nodeValue + break else: if len(i.childNodes) == 0: m[i.nodeName] = i.nodeValue @@ -115,7 +127,7 @@ class RSSParser: self._type = 'atom' return True - def __checkasrss3(self): + def __checkasrss2(self): self._root = self.xmldoc.documentElement if self._root.localName != 'rss' or len(self._root.childNodes) != 1: return False @@ -147,16 +159,29 @@ class RSSParser: if 'ttl' in m and m['ttl'] is not None and m['ttl'].isnumeric(): self.ttl = int(m['ttl']) self.itemList = itemList - self._type = 'rss3.0' + self._type = 'rss2.0' return True def __dealItem(self, node): m = {} for i in node.childNodes: - if len(i.childNodes) == 0: + if i.nodeName == 'link': + if len(i.childNodes) == 0: + m[i.nodeName] = i.nodeValue + else: + m[i.nodeName] = '' + for k in i.childNodes: + m[i.nodeName] = m[i.nodeName] + k.toxml() + break + for i in node.childNodes: + if i.nodeName == 'link': + continue + elif len(i.childNodes) == 0: m[i.nodeName] = i.nodeValue elif len(i.childNodes) == 1 and i.firstChild.nodeName == '#cdata-section': p = HTMLSimpleParser() + if 'link' in m and m['link'] is not None: + p.baseUrl = m['link'] p.feed(i.firstChild.nodeValue) if p.data == '' and i.firstChild.nodeValue.find('<') == -1: m[i.nodeName] = i.firstChild.nodeValue @@ -174,28 +199,95 @@ class RSSParser: def __dealItemAtom(self, node): m = {} for i in node.childNodes: - if i.nodeName == 'author': - if len(i.childNodes) == 1 and i.firstChild.nodeName == 'name': - name = i.firstChild - if len(name.childNodes) == 1 and name.firstChild.nodeName == '#cdata-section': - m['author'] = name.firstChild.nodeValue - elif i.nodeName == 'link': + if i.nodeName == 'link': if 'href' in i.attributes: m[i.nodeName] = i.attributes['href'].nodeValue + for i in node.childNodes: + if i.nodeName == 'author': + for k in i.childNodes: + if k.nodeName == 'name': + if k.nodeValue is not None: + m['author'] = k.nodeValue + break + elif len(k.childNodes) == 1 and k.firstChild.nodeName == '#cdata-section': + m['author'] = k.firstChild.nodeValue + break + elif i.nodeName == 'link': + continue + elif i.nodeName in ['title', 'content', 'summary']: + typ = 'text' + if 'type' in i.attributes: + if i.attributes['type'].nodeValue in ['text', 'html', 'xhtml']: + typ = i.attributes['type'].nodeValue + if len(i.childNodes) == 1 and i.firstChild.nodeName == '#cdata-section': + p = HTMLSimpleParser() + if 'link' in m and m['link'] is not None: + p.baseUrl = m['link'] + p.feed(i.firstChild.nodeValue) + if p.data == '' and i.firstChild.nodeValue.find('<') == -1: + m[i.nodeName] = i.firstChild.nodeValue + else: + m[i.nodeName] = p.data + if i.nodeName in ['content', 'summary']: + m['imgList'] = p.imgList + m['videoList'] = p.videoList + m['description'] = m[i.nodeName] + del m[i.nodeName] + elif i.nodeValue is None and len(i.childNodes) == 0: + continue + elif typ == 'text': + s = '' + if i.nodeValue is not None: + s = i.nodeValue + else: + for k in i.childNodes: + s = s + k.toxml() + m[i.nodeName] = unescape(s) + elif typ == 'html': + s = '' + if i.nodeValue is not None: + s = i.nodeValue + else: + for k in i.childNodes: + s = s + k.toxml() + p = HTMLSimpleParser() + if 'link' in m and m['link'] is not None: + p.baseUrl = m['link'] + p.feed(unescape(s)) + if p.data == '' and i.firstChild.nodeValue.find('<') == -1: + m[i.nodeName] = i.firstChild.nodeValue + else: + m[i.nodeName] = p.data + if i.nodeName in ['content', 'summary']: + m['imgList'] = p.imgList + m['videoList'] = p.videoList + m['description'] = m[i.nodeName] + del m[i.nodeName] + elif typ == 'xhtml': + p = HTMLSimpleParser() + if 'link' in m and m['link'] is not None: + p.baseUrl = m['link'] + p.feed(i.firstChild.toxml()) + if p.data == '' and i.firstChild.nodeValue.find('<') == -1: + m[i.nodeName] = i.firstChild.nodeValue + else: + m[i.nodeName] = p.data + if i.nodeName in ['content', 'summary']: + m['imgList'] = p.imgList + m['videoList'] = p.videoList + m['description'] = m[i.nodeName] + del m[i.nodeName] elif len(i.childNodes) == 0: m[i.nodeName] = i.nodeValue elif len(i.childNodes) == 1 and i.firstChild.nodeName == '#cdata-section': p = HTMLSimpleParser() + if 'link' in m and m['link'] is not None: + p.baseUrl = m['link'] p.feed(i.firstChild.nodeValue) if p.data == '' and i.firstChild.nodeValue.find('<') == -1: m[i.nodeName] = i.firstChild.nodeValue else: m[i.nodeName] = p.data - if i.nodeName == 'content': - m['imgList'] = p.imgList - m['videoList'] = p.videoList - m['description'] = m['content'] - del m['content'] else: m[i.nodeName] = '' for k in i.childNodes: @@ -203,14 +295,15 @@ class RSSParser: return m def check(self): - try: - checked = self.__checkasrss3() - if not checked: - checked = self.__checkasratom() - return checked - except: - print(format_exc()) - return False + for f in [self.__checkasrss2, self.__checkasratom]: + try: + if f(): + self.m['_type'] = self._type + return True + except: + print(format_exc()) + pass + return False def normalize(self): self.removeblank(self.xmldoc.documentElement) @@ -243,4 +336,7 @@ if __name__ == "__main__": fn = sys.argv[1] p = RSSParser() p.parse(fn) - p.check() + if p.check(): + print(p._type) + else: + print('解析失败')