diff --git a/.gitignore b/.gitignore index 076249d..b4fb961 100644 --- a/.gitignore +++ b/.gitignore @@ -162,3 +162,4 @@ cython_debug/ *.json *.db exported/ +img_cache/ diff --git a/README.md b/README.md new file mode 100644 index 0000000..858b08d --- /dev/null +++ b/README.md @@ -0,0 +1,10 @@ +# Setup +## Termux +```shell +pkg install git libxml2 libxslt python ffmpeg +git clone https://github.com/lifegpc/cwm_export +cd cwm_export +python -m venv venv +source venv/bin/activate +pip install -r requirements.txt +``` diff --git a/config.py b/config.py index e014209..a59e243 100644 --- a/config.py +++ b/config.py @@ -35,6 +35,8 @@ class Config: if key in self._data: return self._data[key] else: + if default is not None: + self._data[key] = default return default @cached_property @@ -90,6 +92,10 @@ class Config: temp = temp.replace(f'<{k}>', str(chapter[k])) return temp + @cached_property + def img_cache_dir(self): + return self.get_arg('img_cache_dir', 'img_cache') + @cached_property def key(self): return self.get_arg('key', None) diff --git a/epub.py b/epub.py new file mode 100644 index 0000000..e9c8618 --- /dev/null +++ b/epub.py @@ -0,0 +1,359 @@ +from ebooklib import epub, ITEM_IMAGE +from lxml import etree +import subprocess +from typing import Optional +import os +from config import Config +from image_cache import get_cache +from traceback import print_exc +import xml.etree.ElementTree as ET +from html.parser import HTMLParser +try: + import magic + have_magic = True +except ImportError: + have_magic = False + print('Warning: python-magic not found. 
# ebooklib's manifest items carry no ``fallback`` attribute, so these
# subclasses add one; a value other than None is meant to be written out
# as the manifest ``fallback`` attribute by the writer in this file.
class EpubItem(epub.EpubItem):
    """``epub.EpubItem`` extended with an optional manifest fallback id."""

    def __init__(self, uid=None, file_name='', media_type='',
                 content=b'', manifest=True):
        """Create the item exactly like ``epub.EpubItem`` does.

        ``content`` defaults to an empty bytes object (the same value the
        previous ``epub.six.b('')`` default produced) so the class does not
        depend on ebooklib re-exporting ``six``.
        """
        super().__init__(uid, file_name, media_type, content, manifest)
        # Id of the manifest item to use when a reader cannot handle this
        # one; None means "no fallback".
        self.fallback = None


class EpubImage(EpubItem):
    """Image item that supports the ``fallback`` attribute."""

    def __init__(self, uid=None, file_name='', media_type='',
                 content=b'', manifest=True):
        """Accept the same optional arguments as ``EpubItem``.

        Calling ``EpubImage()`` with no arguments keeps working; passing
        arguments is now possible instead of raising ``TypeError``.
        """
        super().__init__(uid, file_name, media_type, content, manifest)

    def get_type(self):
        """Return the ebooklib item-type constant for images."""
        return ITEM_IMAGE

    def __str__(self):
        # The format string must contain one placeholder per argument;
        # an empty string here raises TypeError when the item is printed.
        return '<EpubImage:%s:%s>' % (self.id, self.file_name)
# Tri-state cache of the ffmpeg probe: None = not probed yet,
# True / False = outcome of the last check_ffmpeg() call.
have_ffmpeg = None


def check_ffmpeg():
    """Probe for a usable ``ffmpeg`` executable.

    Runs ``ffmpeg -h`` with its output discarded and stores in the
    module-level ``have_ffmpeg`` whether it exited with status 0.  A
    missing or non-executable binary makes the launch raise ``OSError``;
    that is recorded as "not available" instead of crashing the export.

    Returns:
        The value stored in ``have_ffmpeg`` (``True`` or ``False``).
    """
    global have_ffmpeg
    try:
        probe = subprocess.run(['ffmpeg', '-h'],
                               stdout=subprocess.DEVNULL,
                               stderr=subprocess.DEVNULL)
    except OSError:
        # e.g. FileNotFoundError when ffmpeg is not on PATH.
        have_ffmpeg = False
    else:
        have_ffmpeg = probe.returncode == 0
    if not have_ffmpeg:
        print('Warning: Can not find ffmpeg. Some epub readers may failed to open these pictures.')  # noqa: E501
    return have_ffmpeg


def perform_convert(image_path: str) -> Optional[str]:
    """Convert ``image_path`` to a JPEG fallback next to the original.

    The output file is ``<image_path without extension>_fallback.jpg``.
    An existing output larger than 4096 bytes is reused without running
    ffmpeg again.

    Args:
        image_path: Path of the source image.

    Returns:
        The path of the JPEG file, or ``None`` when ffmpeg is unavailable
        or exits with a non-zero status.
    """
    output_path = os.path.splitext(image_path)[0] + '_fallback.jpg'
    # Reuse a previous conversion; tiny files are treated as leftovers of
    # a failed run and are regenerated.
    if os.path.exists(output_path) and os.path.getsize(output_path) > 4096:
        return output_path
    if have_ffmpeg is None:
        check_ffmpeg()
    if not have_ffmpeg:
        return None
    try:
        code = subprocess.run(['ffmpeg', '-y', '-i', image_path, output_path],
                              stdout=subprocess.DEVNULL,
                              stderr=subprocess.DEVNULL).returncode
    except OSError:
        # The binary vanished or became unusable after the probe.
        return None
    if code == 0:
        return output_path
    print(f'Warning: Can not convert images by using ffmpeg. (Exit code: {code}) Some epub readers may failed to open these pictures.')  # noqa: E501
    return None
self.epub_path} + if self.alt: + d['alt'] = self.alt + return ET.tostring(ET.Element('img', d), 'unicode') + + +# Used to parse content +class ContentParser(HTMLParser): + def __init__(self, cfg: Config): + super().__init__() + self._in_paragraph = False + self.data = [] + # Local image file lists + self.images = [] + self._paragraph_data = '' + self.cfg = cfg + + def handle_data(self, data: str): + if self._in_paragraph: + if isinstance(self._paragraph_data, str): + self._paragraph_data += data + elif isinstance(self._paragraph_data, list): + self._paragraph_data.append(data) + + def handle_starttag(self, tag, attrs): + if tag == 'img': + if self._in_paragraph: + if self._paragraph_data: + self._paragraph_data = [self._paragraph_data] + else: + self._paragraph_data = [] + self._paragraph_data.append(HTMLImage(attrs, self.cfg)) + else: + self.data.append(HTMLImage(attrs, self.cfg)) + elif tag == 'p': + self._in_paragraph = True + elif tag == 'book': + pass + else: + raise NotImplementedError() + + def handle_endtag(self, tag: str): + if tag == 'img': + pass + elif tag == 'p': + self._in_paragraph = False + if self._paragraph_data: + self.data.append(self._paragraph_data) + self._paragraph_data = '' + else: + raise NotImplementedError() + + def have_image(self, data_list=None) -> bool: + if data_list is None: + data_list = self.data + for i in data_list: + if isinstance(i, HTMLImage): + if i.is_valid(): + return True + elif isinstance(i, list): + if self.have_image(i): + return True + return False + + def to_local(self, data_list=None) -> str: + default_data_list = False + if data_list is None: + data_list = self.data + default_data_list = True + data = '' + for i in data_list: + if isinstance(i, str): + if default_data_list: + data += f'

{i}

\n' + else: + data += i + elif isinstance(i, HTMLImage): + if i.is_valid(): + try: + data += i.to_local() + self.images.append(i) + except ValueError: + print("the image is not valid.", i.src) + elif isinstance(i, list): + data += f'

{self.to_local(i)}

\n' + else: + raise NotImplementedError() + if self._paragraph_data: + data += f'

{self._paragraph_data}

def get_extension(mime: str) -> str:
    """Return the file extension used in the EPUB for an image MIME type.

    Matching ignores letter case, surrounding whitespace and any MIME
    parameters after ``;`` (for example ``image/svg+xml; charset=utf-8``).

    Args:
        mime: MIME type string, e.g. ``'image/png'``.

    Returns:
        The extension including the leading dot.

    Raises:
        NotImplementedError: ``mime`` is not one of the supported image
            types; the offending value is included in the message.
    """
    extensions = {
        'image/gif': '.gif',
        'image/jpeg': '.jpeg',
        'image/png': '.png',
        'image/svg+xml': '.svg',
        'image/webp': '.webp',  # EPUB 3.3 Draft
    }
    essence = mime.split(';', 1)[0].strip().lower()
    try:
        return extensions[essence]
    except KeyError:
        # Carry the value in the exception instead of printing it, so the
        # caller decides where the diagnostic goes.
        raise NotImplementedError(
            f'Unsupported image MIME type: {mime!r}') from None

书名:{book['book_name']}

ID:{book['book_id']}

+

作者:{book['author_name']}

更新时间:{book['uptime']}

+

最新章节:{book['last_chapter_info']['chapter_title']}

''' + self.epub.add_item(intro) + self.EpubList.append(intro) + self.epub.spine.append(intro) + self.last_division_name = '' + + def add_chapter(self, chapter, content: str, division_name: str): + chapter_title = chapter['chapter_title'] + chapter_id = chapter['chapter_id'] + ch = epub.EpubHtml( + title=chapter_title, + file_name=f'ch{chapter_id}.xhtml', + lang='zh-CN', + uid=f'ch{chapter_id}', + ) + if division_name == '作品相关': + ch.is_linear = False + parser = ContentParser(self.cfg) + contents = content.splitlines() + try: + parser.feed('

' + '

\n

'.join(contents) + '

') + except Exception as e: + print('

' + '

\n

'.join(contents) + '

') + raise e + parser.close() + ch.content = f'

{chapter_title}

\n{parser.to_local()}' # noqa: E501 + self.epub.add_item(ch) + count = 0 + for oimg in parser.images: + with open(oimg.path, 'rb') as f: + img_content = f.read() + img = EpubImage() + img.file_name = oimg.epub_path + img.content = img_content + img.id = f'i{chapter_id}_{count}' + self.epub.add_item(img) + if oimg.epub_path.endswith('.webp'): + img.media_type = 'image/webp' + jpg_path = perform_convert(oimg.path) + if jpg_path is not None: + jpg_img = epub.EpubImage() + with open(jpg_path, 'rb') as f: + jpg_content = f.read() + jpg_img.file_name = os.path.basename(jpg_path) + jpg_img.content = jpg_content + jpg_img.id = img.id + 'f' + img.fallback = jpg_img.id + self.epub.add_item(jpg_img) + count += 1 + if self.last_division_name != division_name: + self.EpubList.append([epub.Link(ch.file_name, division_name), []]) + self.last_division_name = division_name + if isinstance(self.EpubList[-1], list): + self.EpubList[-1][-1].append(ch) + else: + self.EpubList.append(ch) + self.epub.spine.append(ch) + + def save_epub_file(self): # save epub file to local + # the path to save epub file to local + self.epub.toc = self.EpubList + self.epub.add_item(epub.EpubNcx()), self.epub.add_item(epub.EpubNav()) + book = EpubWriter(self.out, self.epub, {}) + book.process() + try: + book.write() + except IOError: + print_exc() diff --git a/export.py b/export.py index 5d247d6..93f2502 100644 --- a/export.py +++ b/export.py @@ -6,6 +6,7 @@ from booksnew import BooksNew from crypto import decrypt from os.path import dirname from os import makedirs +from epub import EpubFile key_imported = False @@ -56,6 +57,14 @@ def export_book(ncw: NovelCiwei, db: CwmDb, cfg: Config, bn: BooksNew, d = dirname(txt_filename) makedirs(d, exist_ok=True) txt = open(txt_filename, 'w', encoding='UTF-8') + if cfg.export_epub: + try: + epub = EpubFile(cfg, cfg.get_export_book(book, 'epub')) + epub.set_book(book) + except Exception as e: + if cfg.export_txt: + txt.close() + raise e try: chapters = 
def try_fetch(url, retries=5, timeout=30, session=None):
    """Download ``url`` and return the response body as bytes.

    The request is attempted up to ``retries`` times.  An attempt fails
    when the request raises ``OSError`` (the base class of the exceptions
    raised by ``requests``) or when the status code is not 200.

    Args:
        url: Address to fetch.
        retries: Maximum number of attempts.
        timeout: Per-request timeout in seconds, passed to ``get``; stops
            a stalled connection from blocking the export forever.
        session: Optional object with a ``requests``-compatible
            ``get(url=..., timeout=...)`` method (e.g. a
            ``requests.Session``).  The ``requests`` module itself is
            used when omitted.

    Returns:
        The ``content`` of the first response with status code 200.

    Raises:
        ValueError: Every attempt failed.  The message holds the HTTP
            status when the last attempt produced a response; otherwise
            the last transport error is chained as ``__cause__``.
    """
    client = requests if session is None else session
    http_error = None   # message for the most recent non-200 response
    last_exc = None     # most recent transport-level exception
    for _ in range(retries):
        try:
            response = client.get(url=url, timeout=timeout)
        except OSError as exc:
            # Remember the failure instead of silently dropping it; the
            # name bound by a previous attempt must not be reused.
            last_exc, http_error = exc, None
            continue
        if response.status_code == 200:
            return response.content
        http_error = f'HTTP ERROR {response.status_code} {response.reason}.'
        last_exc = None
    if http_error is not None:
        raise ValueError(http_error)
    raise ValueError('Failed to fetch the image.') from last_exc
dirname(path) + makedirs(d, exist_ok=True) + with open(path, 'wb') as f: + f.write(img) + return path diff --git a/main.py b/main.py index ad673d0..9f146c3 100644 --- a/main.py +++ b/main.py @@ -17,6 +17,7 @@ parser.add_argument('-r', '--real', help='Use default locations. Needed running parser.add_argument('-B', '--bid', '--book-id', help='The book id.', type=int) parser.add_argument('-t', '--type', help='Export type. Available types: epub, txt. Default: epub,txt') # noqa: E501 parser.add_argument('--ebt', '--export-book-template', help='The template of the exported book. Available key: , , , eta.') # noqa: E501 +parser.add_argument('--icd', '--image-cache-dir', help='Path to image cache directory.') # noqa: E501 parser.add_argument('action', help='The action to do.', choices=['importkey', 'exportchapter', 'exportbook']) # noqa: E501 diff --git a/requirements.txt b/requirements.txt index e218f03..54a34ce 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,7 @@ PyCryptodome semver +ebooklib +lxml +python-magic +python-magic-bin ; sys_platform == 'win32' +requests