From 92a054324a721d5dc54f763c26cf185671dbe8f7 Mon Sep 17 00:00:00 2001 From: lifegpc Date: Tue, 25 Jan 2022 22:30:23 +0800 Subject: [PATCH] Update --- google_play.py | 393 ++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 387 insertions(+), 6 deletions(-) diff --git a/google_play.py b/google_play.py index b93360f..3bf2118 100644 --- a/google_play.py +++ b/google_play.py @@ -1,10 +1,10 @@ from requests import Session from http.cookiejar import MozillaCookieJar from html.parser import HTMLParser -from typing import Any, Callable, Dict, List, Tuple, Optional +from typing import Any, Callable, Dict, List, Tuple, Optional, Union from js2py import eval_js from os import makedirs, remove -from os.path import join, exists, relpath, abspath +from os.path import join, exists, relpath, abspath, basename, splitext from traceback import print_exc from json import load, dump, loads from urllib.parse import urljoin, parse_qs, urlparse @@ -15,8 +15,9 @@ from textwrap import wrap as _wrap from zipfile import ZIP_DEFLATED, ZIP_STORED, ZipFile from subprocess import Popen, DEVNULL from xml.etree import ElementTree as ET -from time import gmtime, strftime, strptime +from time import gmtime, strftime, strptime, time from calendar import timegm +from posixpath import join as posixjoin def get_iso8601_time(d: int) -> str: @@ -29,12 +30,20 @@ def parse_time(s: str) -> int: DC_NS = 'http://purl.org/dc/elements/1.1/' OPF_NS = 'http://www.idpf.org/2007/opf' +SVG_NS = 'http://www.w3.org/2000/svg' +XLINK_NS = 'http://www.w3.org/1999/xlink' +XHTML_NS = 'http://www.w3.org/1999/xhtml' +SVG_TAGS = ['svg', '{%s}svg' % (SVG_NS)] +SWITCH_TAGS = ['switch', '{%s}switch' % (OPF_NS)] ALLOW_DC_LIST = ['coverage', 'description', 'format', 'publisher', 'relation', 'rights', 'source', 'type'] +ALL_IMAGE_MIMETYPES = ['image/gif', 'image/jpeg', 'image/png', 'image/svg+xml', 'image/webp'] +EXT_MIMETYPES = {'.jpeg': 'image/jpeg', '.jpg': 'image/jpeg', '.png': 'image/png', '.gif': 'image/gif', '.svg': 'image/svg+xml', '.webp': 'image/webp'} PREFIX_DICT = {'rendition': 'http://www.idpf.org/vocab/rendition/#', 'ebpaj': 'http://www.ebpaj.jp/', 'fixed-layout-jp': 'http://www.digital-comic.jp/', 'kadokawa': 'http://www.access-company.com/2012/layout#', - 'ibooks': 'http://vocabulary.itunes.apple.com/rdf/ibooks/vocabulary-extensions-1.0/'} + 'ibooks': 'http://vocabulary.itunes.apple.com/rdf/ibooks/vocabulary-extensions-1.0/', + 'dcterms': 'https://www.dublincore.org/specifications/dublin-core/dcmi-terms/'} js_dumps: Callable[[Any], str] = eval_js('function(a){return JSON.stringify(a);}') @@ -123,6 +132,33 @@ class EPUBMeta: return e +class ADEPageMap: + def __init__(self, p): + self.location = 'page_map.xml' + self.pages: List[Union[str, Tuple[str, str]]] = [] + self.id = 'page_map' + self._p: EPUBPackage = p + + def add_page(self, href: str, name: str = None): + if name is None: + self.pages.append(href) + else: + self.pages.append((name, href)) + + def dump(self): + map = ET.Element('page-map', {'xmlns': OPF_NS}) + for i in self.pages: + d = {'href': i, 'name': ''} if isinstance(i, str) else {'href': i[1], 'name': i[0]} + if not self._p.manifest.have_href(d['href']): + raise ValueError(f"Can not find href {d['href']} in manifest.") + map.append(ET.Element('page', d)) + return ET.tostring(map, 'UTF-8') + + def save(self, f: ZipFile): + f.writestr(self.location, self.dump()) + print(f'Added {self.location} to archive.') + + class EPUBMetadata: def __init__(self, p): self._p: EPUBPackage = p @@ -199,6 +235,8 @@ class EPUBMetadata: metadata.append(e) for i in self.metas: metadata.append(i.dump()) + lmt = EPUBMeta('dcterms:modified', get_iso8601_time(time())) + metadata.append(lmt.dump()) return metadata def get_unique_identifier(self) -> Optional[str]: @@ -224,7 +262,7 @@ class EPUBMetadata: @property def prefix(self): - r = [] + r = [f"dcterms: {PREFIX_DICT['dcterms']}"] for i in self.metas: if i.prefix_url: if i.prefix_url not in r: @@ -232,15 +270,189 @@ class EPUBMetadata: return ' '.join(r) +class EPUBItem: + def __init__(self, href: str, id: str, media_type: str, file: str = None, properties: List[str] = None, fallback: str = None, media_overlay: str = None): + self.href = href + self.id = id + self.media_type = media_type + self.file = file + self.properties = properties + self.fallback = fallback + self.media_overlay = media_overlay + self.file = file + + def dump(self) -> ET.Element: + e = ET.Element('item', {'href': self.href, 'id': self.id, 'media-type': self.media_type}) + if self.properties: + e.attrib['properties'] = ' '.join(self.properties) + if self.fallback: + e.attrib['fallback'] = self.fallback + if self.media_overlay: + e.attrib['media-overlay'] = self.media_overlay + return e + + def save(self, f: ZipFile): + if self.file: + f.write(self.file, self.href) + print(f'Added {self.file} to {self.href} in archive.') + + +class EPUBManifest: + def __init__(self, p): + self._p: EPUBPackage = p + self.items: List[EPUBItem] = [] + + def add_item(self, href: str, id: str, media_type: str, file: str = None, properties: List[str] = None, fallback: str = None, media_overlay: str = None): + if id == 'nav' or (self._p.page_map and self._p.page_map.id == id): + raise ValueError(f'id {id} already used.') + for i in self.items: + if i.href == href: + raise ValueError(f'href {href} already used.') + elif i.id == id: + raise ValueError(f'id {id} already used.') + self.items.append(EPUBItem(href, id, media_type, file, properties, fallback, media_overlay)) + + def add_cover(self, href: str, id: str, media_type: str, file: str): + if self.have_cover_image: + raise ValueError('Only one cover image.') + if media_type not in ALL_IMAGE_MIMETYPES: + raise ValueError(f'Unsupported media type {media_type}') + self.add_item(href, id, media_type, file, ['cover-image']) + + def dump(self): + items = self.items + [EPUBItem(self._p.page_map.location, self._p.page_map.id, 'application/oebps-page-map+xml')] if self._p.page_map else self.items + items.append(EPUBItem(self._p.nav.location, 'nav', 'application/xhtml+xml', properties=['nav'])) + e = ET.Element('manifest') + for i in items: + e.append(i.dump()) + return e + + @property + def have_cover_image(self): + for i in self.items: + if i.properties and 'cover-image' in i.properties: + return True + return False + + def have_href(self, href: str): + if '#' in href: + href = href[:href.rfind('#')] + for i in self.items: + if i.href == href: + return True + return False + + def have_id(self, id: str): + for i in self.items: + if i.id == id: + return True + return False + + def save(self, f: ZipFile): + for i in self.items: + i.save(f) + + +class EPUBItemRef: + def __init__(self, idref: str, linear: str = None, properties: List[str] = None, id: str = None): + self.idref = idref + self.linear = linear + self.properties = properties + self.id = id + + def dump(self): + e = ET.Element('itemref', {'idref': self.idref}) + if self.linear: + e.attrib['linear'] = self.linear + if self.properties: + e.attrib['properties'] = ' '.join(self.properties) + if self.id: + e.attrib['id'] = self.id + return e + + +class EPUBSpine: + def __init__(self, p): + self._p: EPUBPackage = p + self.refs: List[EPUBItemRef] = [] + self.page_progression_direction = None + + def add_ref(self, idref: str, linear: str = None, properties: List[str] = None, id: str = None): + self.refs.append(EPUBItemRef(idref, linear, properties, id)) + + def dump(self): + e = ET.Element('spine') + if self.page_progression_direction: + e.attrib['page-progression-direction'] = self.page_progression_direction + for ref in self.refs: + if not self._p.manifest.have_id(ref.idref): + raise ValueError(f"Can not find id {ref.idref} in manifest.") + e.append(ref.dump()) + return e + + +class EPUBNav: + def __init__(self, text: str, href: str = None): + self.text = text + self.href = href + self.childrens: List[EPUBNav] = [] + + def dump(self): + e = ET.Element('li') + a = ET.Element('a' if self.href else 'span', {'href': self.href} if self.href else {}) + a.text = self.text + e.append(a) + if self.childrens: + ol = ET.Element('ol') + for i in self.childrens: + ol.append(i.dump()) + e.append(ol) + return e + + +class EPUBNavigation: + def __init__(self): + self.location = 'nav.xhtml' + self.head = None + self.navs: List[EPUBNav] = [] + + def dump(self): + e = ET.Element('nav', {'xmlns:epub': OPF_NS, 'epub:type': 'toc', 'id': 'toc'}) + if len(self.navs) < 1: + raise InvalidEPUB('At least one nav element is needed.') + if self.head: + h = ET.Element('h1') + h.text = self.head + e.append(h) + ol = ET.Element('ol') + for i in self.navs: + ol.append(i.dump()) + e.append(ol) + return ET.tostring(e, 'UTF-8') + + def save(self, f: ZipFile): + f.writestr(self.location, self.dump()) + print(f'Added {self.location} to archive.') + + class EPUBPackage: def __init__(self, location: str = 'package.opf'): self.location = location self.metadata = EPUBMetadata(self) self.language = '' + self.manifest = EPUBManifest(self) + self.spine = EPUBSpine(self) + self.page_map: Optional[ADEPageMap] = None + self.nav = EPUBNavigation() def add_identifier(self, value: str, id: str = None): self.metadata.add_identifier(value, id) + def add_page(self, href: str, name: str = None): + if self.page_map is None: + self.page_map = ADEPageMap(self) + self.page_map.add_page(href, name) + def dump(self): ide = self.metadata.get_unique_identifier() if ide is None: @@ -252,11 +464,17 @@ class EPUBPackage: if prefix: root.attrib['prefix'] = prefix root.append(self.metadata.dump()) + root.append(self.manifest.dump()) + root.append(self.spine.dump()) return ET.tostring(root, 'UTF-8') def save(self, f: ZipFile): f.writestr(self.location, self.dump()) print(f'Added {self.location} to archive.') + self.nav.save(f) + if self.page_map: + self.page_map.save(f) + self.manifest.save(f) @property def unique_identifier(self): @@ -313,6 +531,9 @@ class EPUB: self.packages = self.container.packages self.package = self.packages[0] self.metadata = self.package.metadata + self.manifest = self.package.manifest + self.spine = self.package.spine + self.nav = self.package.nav def add_identifier(self, value: str, id: str = None): self.package.add_identifier(value, id) @@ -327,6 +548,86 @@ class EPUB: self.container.save(f) +class XHTMLConvert: + def __init__(self, root: ET.Element): + self.root = root + self.have_svg = False + self.scripted = False + self.have_remote_resources = False + self.have_switch = False + self.head = self.root.find('head') + if self.head is None: + raise ValueError('Can not find head element.') + self.body = self.root.find('body') + if self.body is None: + raise ValueError('Can not find body element.') + self.head.append(ET.Element('meta', {'charset': 'UTF-8'})) + self.title = None + + def add_css(self, href): + self.head.append(ET.Element('link', {'href': href, 'rel': 'stylesheet', 'type': 'text/css'})) + + def convert(self, url_maps: Dict[str, str], allow_remote_resources: bool = False): + if self.root.find('script'): + self.scripted = True + for i in SVG_TAGS: + if self.root.find(i): + self.have_svg = True + for i in SWITCH_TAGS: + if self.root.find(i): + self.have_switch = True + for i in self.body.iter(): + if i.tag == '{%s}image' % (SVG_NS): + for tag in ['href', '{%s}href' % (XLINK_NS)]: + if tag in i.attrib: + url = i.attrib[tag] + re = urlparse(url) + if re.scheme or re.hostname: + if url in url_maps: + i.attrib[tag] = url_maps[url] + else: + if allow_remote_resources: + self.have_remote_resources = True + else: + raise ValueError(f'Unknown remote resource: {url}') + elif i.tag == 'img': + if 'src' in i.attrib: + url = i.attrib['src'] + re = urlparse(url) + if re.scheme or re.hostname: + if url in url_maps: + i.attrib['src'] = url_maps[url] + else: + if allow_remote_resources: + self.have_remote_resources = True + else: + raise ValueError(f'Unknown remote resource: {url}') + + @property + def properties(self) -> Optional[List[str]]: + r = [] + if self.have_svg: + r.append('svg') + if self.have_switch: + r.append('switch') + if self.have_remote_resources: + r.append('remote-resources') + if self.scripted: + r.append('scripted') + return r if len(r) else None + + def save(self, fn: str): + with open(fn, 'wb') as f: + f.write(ET.tostring(self.root, 'UTF-8')) + print(f'Writed XHMTL to {fn}.') + + def set_title(self, title: str): + if self.title is None: + self.title = ET.Element('title') + self.head.append(self.title) + self.title.text = title + + def detect_7z() -> bool: try: pro = Popen(['7z', '-h'], stdout=DEVNULL) @@ -587,6 +888,13 @@ def get_genres(d: dict) -> Optional[str]: return None +def find_page(d, pid): + for i in d: + if i['pid'] == pid: + return i + return None + + cookies = MozillaCookieJar('google.txt') cookies.load() ses = Session() @@ -816,7 +1124,7 @@ try: with open(file_list_loc, 'w', encoding='UTF-8') as f: f.write('\n'.join(file_list)) add_7z_archive(output, file_list_loc, filename, argsd['7z_compress_level']) - else: + elif args.type == 'EPUB': output = args.output if args.output else f'{authors} - {title}.epub' webre = ses.get(f'https://play.google.com/store/books/details/?id={book_id}') if webre.status_code >= 400: @@ -853,6 +1161,79 @@ try: e.metadata.add_data('publisher', publisher) for i in meta[0]['meta']: e.metadata.add_meta(i['property'], i['cdata']) + if meta[0]['is_right_to_left']: + e.spine.page_progression_direction = 'rtl' + for i in meta[0]['toc_entry']: + if i['depth'] != 0: + raise NotImplementedError('Non-zero depth toc.') + seg_meta = meta[0]['segment'][i['segment_index']] + label = seg_meta['label'] + if not label.endswith('.xhtml'): + label += '.xhtml' + href = posixjoin("xhtml", label) + e.nav.navs.append(EPUBNav(i['label'], href)) + cover_meta = meta[0]['segment'][0] + seg_file = join(filename, f"{cover_meta['label']}.json") + with open(seg_file, 'r', encoding='UTF-8') as f: + seg_info = load(f) + for res in seg_info['resource']: + if res['mime_type'] == 'image': + res_info = resources[res['url']] + href = posixjoin('image', basename(res_info['file'])) + id = splitext(basename(res_info['file']))[0] + mimetype = EXT_MIMETYPES[splitext(res_info['file'])[1]] + e.manifest.add_cover(href, id, mimetype, res_info['file']) + break + ET.register_namespace('epub', OPF_NS) + ET.register_namespace('svg', SVG_NS) + ET.register_namespace('xlink', XLINK_NS) + for seg_meta in meta[0]['segment']: + seg_file = join(filename, f"{seg_meta['label']}.json") + with open(seg_file, 'r', encoding='UTF-8') as f: + seg_info = load(f) + if 'content' in seg_info: + if seg_info['content_encrypted'] and decrypter is not None: + seg_info['content'] = decrypter.decrypt(b64decode(segment['content'])).decode() + seg_info['content_encrypted'] = False + tree: ET.Element = ET.fromstring(f"{seg_info['content']}") + converter = XHTMLConvert(tree) + url_maps = {} + for i in seg_info['resource']: + res = resources[i['url']] + if i['mime_type'] == 'text/css': + href = posixjoin('css', basename(res['file'])) + id = splitext(basename(res['file']))[0] + if not e.manifest.have_href(href): + e.manifest.add_item(href, id, 'text/css', res['file']) + converter.add_css(posixjoin('..', href)) + elif i['mime_type'] in 'image': + href = posixjoin('image', basename(res['file'])) + id = splitext(basename(res['file']))[0] + mimetype = EXT_MIMETYPES[splitext(res['file'])[1]] + if not e.manifest.have_href(href): + e.manifest.add_item(href, id, mimetype, res['file']) + url_maps[i['url']] = posixjoin('..', href) + converter.set_title(title) + print(url_maps) + converter.convert(url_maps) + for i in converter.root.iter(): + print(i, i.text, i.attrib) + label = seg_meta['label'] + if not label.endswith('.xhtml'): + label += '.xhtml' + href = posixjoin('xhtml', label) + path = join(filename, label) + converter.save(path) + e.manifest.add_item(href, seg_meta['label'], 'application/xhtml+xml', path, converter.properties) + e.spine.add_ref(seg_meta['label'], 'yes') + if 'page' in seg_info: + for i in seg_info['page']: + p = find_page(meta[0]['page'], i['pid']) + if p is None: + raise ValueError(f"Can not find page {i['pid']}") + e.package.add_page(f"{href}#{i['pid']}", p['title']) + else: + print(f"No content in segment {seg_meta['label']}") e.save(output) finally: cookies.save()