Add epub support

2024-03-08 17:08:58 +08:00
parent f61a2af57d
commit 9cfb73759b
8 changed files with 433 additions and 1 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -162,3 +162,4 @@ cython_debug/
 *.json
 *.db
 exported/
+img_cache/
--- a/README.md
+++ b/README.md
@@ -0,0 +1,10 @@
+# Setup
+## Termux
+```shell
+pkg install git libxml2 libxslt python ffmpeg
+git clone https://github.com/lifegpc/cwm_export
+cd cwm_export
+python -m venv venv
+source venv/bin/activate
+pip install -r requirements.txt
+```
--- a/config.py
+++ b/config.py
@@ -35,6 +35,8 @@ class Config:
        if key in self._data:
            return self._data[key]
        else:
+            if default is not None:
+                self._data[key] = default
            return default

    @cached_property
@@ -90,6 +92,10 @@ class Config:
            temp = temp.replace(f'<{k}>', str(chapter[k]))
        return temp

+    @cached_property
+    def img_cache_dir(self):
+        """Return the configured image cache directory.
+
+        The value is looked up through ``get_arg`` under the key
+        ``'img_cache_dir'``, with ``'img_cache'`` passed as the default.
+        ``cached_property`` keeps the first result on the instance.
+        """
+        return self.get_arg('img_cache_dir', 'img_cache')
+
    @cached_property
    def key(self):
        return self.get_arg('key', None)
--- a/epub.py
+++ b/epub.py
@@ -0,0 +1,359 @@
+from ebooklib import epub, ITEM_IMAGE
+from lxml import etree
+import subprocess
+from typing import Optional
+import os
+from config import Config
+from image_cache import get_cache
+from traceback import print_exc
+import xml.etree.ElementTree as ET
+from html.parser import HTMLParser
+try:
+    import magic
+    have_magic = True
+except ImportError:
+    have_magic = False
+    print('Warning: python-magic not found. The mimetype in EPUB file may wrong.')  # noqa: E501
+    import platform
+    if platform.system() == "Windows":
+        print('python-magic-bin is also needed on Windows.')
+
+
+# Add fallback property to ebooklib
+# ebooklib does not support fallback
+class EpubItem(epub.EpubItem):
+    """``epub.EpubItem`` that additionally carries a ``fallback`` attribute.
+
+    ``fallback`` starts out as ``None``. ``EpubWriter`` writes it as the
+    ``fallback`` attribute of the manifest entry when it is not ``None``.
+    """
+
+    def __init__(self, uid=None, file_name='', media_type='',
+                 content=epub.six.b(''), manifest=True):
+        epub.EpubItem.__init__(self, uid, file_name, media_type, content,
+                               manifest)
+        # Id of the manifest item to fall back to; None means no fallback.
+        self.fallback = None
+
+
+class EpubImage(EpubItem):
+    """Image item built on the fallback-aware ``EpubItem``."""
+
+    def __init__(self):
+        # Takes no arguments; id, file_name and content are assigned by the
+        # caller after construction.
+        super().__init__()
+
+    def get_type(self):
+        """Return the item type constant used for images."""
+        return ITEM_IMAGE
+
+    def __str__(self):
+        return f'<EpubImage:{self.id}:{self.file_name}>'
+
+
+class EpubWriter(epub.EpubWriter):
+    """``epub.EpubWriter`` whose manifest entries can carry ``fallback``.
+
+    NOTE(review): ``_write_opf_manifest`` presumably replaces the ebooklib
+    method of the same name; confirm against the installed ebooklib version.
+    """
+
+    def _write_opf_manifest(self, root):
+        """Append a ``<manifest>`` element to ``root`` and fill it.
+
+        One ``<item>`` is written for every book item whose ``manifest`` flag
+        is truthy. Returns the id of the last ``EpubNcx`` item seen, or
+        ``None`` when the book has none.
+        """
+        manifest = epub.etree.SubElement(root, 'manifest')
+        _ncx_id = None
+
+        for item in self.book.get_items():
+            # Items flagged manifest=False are left out of the manifest.
+            if not item.manifest:
+                continue
+
+            if isinstance(item, epub.EpubNav):
+                # Navigation document: marked with properties="nav".
+                etree.SubElement(manifest, 'item', {'href': item.get_name(),
+                                                    'id': item.id,
+                                                    'media-type': item.media_type,  # noqa: E501
+                                                    'properties': 'nav'})
+            elif isinstance(item, epub.EpubNcx):
+                # Remember the NCX id; it is the return value of this method.
+                _ncx_id = item.id
+                etree.SubElement(manifest, 'item', {'href': item.file_name,
+                                                    'id': item.id,
+                                                    'media-type': item.media_type})  # noqa: E501
+
+            elif isinstance(item, epub.EpubCover):
+                # Cover image: marked with properties="cover-image".
+                etree.SubElement(manifest, 'item', {'href': item.file_name,
+                                                    'id': item.id,
+                                                    'media-type': item.media_type,  # noqa: E501
+                                                    'properties': 'cover-image'})  # noqa: E501
+            else:
+                # Any other item: start from the mandatory attributes and add
+                # the optional ones only when the item defines them.
+                opts = {'href': item.file_name,
+                        'id': item.id,
+                        'media-type': item.media_type}
+
+                if hasattr(item, 'properties') and len(item.properties) > 0:
+                    opts['properties'] = ' '.join(item.properties)
+
+                if hasattr(item, 'media_overlay') and item.media_overlay is not None:  # noqa: E501
+                    opts['media-overlay'] = item.media_overlay
+
+                if hasattr(item, 'media_duration') and item.media_duration is not None:  # noqa: E501
+                    opts['duration'] = item.media_duration
+
+                # The attribute this subclass exists for: the id of another
+                # manifest item to use as a fallback.
+                if hasattr(item, 'fallback') and item.fallback is not None:
+                    opts['fallback'] = item.fallback
+
+                etree.SubElement(manifest, 'item', opts)
+
+        return _ncx_id
+
+
+# Tri-state cache: None = not probed yet, True/False = result of the probe.
+have_ffmpeg = None
+
+
+def check_ffmpeg():
+    """Probe for an ``ffmpeg`` executable and record the result.
+
+    Runs ``ffmpeg -h`` with its output discarded and stores ``True`` in the
+    module-level ``have_ffmpeg`` when it exits with status 0, ``False``
+    otherwise. When the executable cannot be started at all (for example it
+    is not installed), the ``OSError`` is treated as "not available" instead
+    of aborting the export. A warning is printed when ffmpeg is unavailable.
+    """
+    global have_ffmpeg
+    try:
+        code = subprocess.call(['ffmpeg', '-h'], stdout=subprocess.DEVNULL,
+                               stderr=subprocess.DEVNULL)
+    except OSError:
+        # FileNotFoundError / PermissionError: ffmpeg cannot be launched.
+        code = None
+    have_ffmpeg = code == 0
+    if not have_ffmpeg:
+        print('Warning: Can not find ffmpeg. Some epub readers may failed to open these pictures.')  # noqa: E501
+
+
+def perform_convert(image_path: str) -> Optional[str]:
+    """Create a JPEG copy of ``image_path`` with ffmpeg.
+
+    The copy is written next to the source as ``<stem>_fallback.jpg``. An
+    existing copy larger than 4096 bytes is reused without running ffmpeg.
+
+    Returns:
+        The path of the JPEG copy, or ``None`` when ffmpeg is unavailable or
+        exits with a non-zero status (a warning is printed in that case).
+    """
+    stem, _ = os.path.splitext(image_path)
+    output_path = stem + '_fallback.jpg'
+    already_converted = (os.path.exists(output_path)
+                         and os.path.getsize(output_path) > 4096)
+    if already_converted:
+        return output_path
+    # Probe for ffmpeg lazily, the first time a conversion is needed.
+    if have_ffmpeg is None:
+        check_ffmpeg()
+    if not have_ffmpeg:
+        return None
+    code = subprocess.call(['ffmpeg', '-y', '-i', image_path, output_path],
+                           stdout=subprocess.DEVNULL,
+                           stderr=subprocess.DEVNULL)
+    if code:
+        print(f'Warning: Can not convert images by using ffmpeg. (Exit code: {code}) Some epub readers may failed to open these pictures.')  # noqa: E501
+        return None
+    return output_path
+
+
+class HTMLImage:
+    """An ``<img>`` tag from chapter content and its locally cached file.
+
+    Attributes:
+        src: value of the ``src`` attribute, or ``None`` when absent.
+        alt: value of the ``alt`` attribute, or ``None`` when absent.
+        path: local cache path; set by ``download_image``.
+        epub_path: file name used for the image in the EPUB; set by
+            ``to_local``.
+    """
+
+    def __init__(self, attrs, cfg: Config):
+        """Pick ``src`` and ``alt`` out of ``attrs`` (``(name, value)`` pairs)."""
+        self.src = None
+        self.alt = None
+        self.path = None
+        self.epub_path = None
+        for key, value in attrs:
+            if key == 'src':
+                self.src = value
+            elif key == 'alt':
+                self.alt = value
+        self.cfg = cfg
+
+    def is_valid(self):
+        """Return True when the tag carries a ``src`` value."""
+        return self.src is not None
+
+    def download_image(self):
+        """Fetch the image into the cache and remember its local path.
+
+        Returns:
+            True on success. False when the tag has no ``src`` or fetching
+            failed; in the latter case the traceback is printed.
+        """
+        if not self.is_valid():
+            return False
+        try:
+            self.path = get_cache(self.cfg, self.src)
+        except Exception:
+            print_exc()
+            # Report failure explicitly rather than falling through to None.
+            return False
+        return True
+
+    def to_local(self):
+        """Return an ``<img>`` element that points at the local copy.
+
+        Returns an empty string when the tag has no ``src``. Otherwise the
+        image is downloaded and ``epub_path`` is set to the cached file's base
+        name. When python-magic is available, the extension is replaced by the
+        one matching the detected MIME type.
+
+        Raises:
+            ValueError: when the image could not be downloaded.
+        """
+        if not self.is_valid():
+            return ""
+        if not self.download_image():
+            raise ValueError("Failed to download image.")
+        self.epub_path = os.path.basename(self.path)
+        if have_magic:
+            # Sniff the real type from the first 4 KiB; the cached file name
+            # comes from the URL and its extension may be missing or wrong.
+            with open(self.path, 'rb') as f:
+                mime = magic.from_buffer(f.read(4096), True)
+            self.epub_path = os.path.splitext(self.epub_path)[0] + get_extension(mime)  # noqa: E501
+        d = {'src': self.epub_path}
+        if self.alt:
+            d['alt'] = self.alt
+        # Serialising through ElementTree escapes the attribute values.
+        return ET.tostring(ET.Element('img', d), 'unicode')
+
+
+# Used to parse content
+class ContentParser(HTMLParser):
+    """Parse chapter markup made of ``<p>``, ``<img>`` and ``<book>`` tags.
+
+    After feeding, ``data`` holds one entry per non-empty paragraph: a ``str``
+    for a text-only paragraph, or a flat ``list`` mixing ``str`` and
+    ``HTMLImage`` for a paragraph that contains images. Images outside any
+    paragraph are stored directly as ``HTMLImage`` entries; text outside a
+    paragraph is ignored. Any other tag raises ``NotImplementedError``.
+    """
+
+    def __init__(self, cfg: Config):
+        super().__init__()
+        self._in_paragraph = False
+        self.data = []
+        # HTMLImage objects that to_local() converted successfully.
+        self.images = []
+        # Paragraph being parsed: a str, or a list once it holds an image.
+        self._paragraph_data = ''
+        self.cfg = cfg
+
+    def handle_data(self, data: str):
+        """Collect text that appears inside a paragraph."""
+        if not self._in_paragraph:
+            return
+        if isinstance(self._paragraph_data, list):
+            self._paragraph_data.append(data)
+        else:
+            self._paragraph_data += data
+
+    def handle_starttag(self, tag, attrs):
+        """Handle an opening tag.
+
+        Raises:
+            NotImplementedError: for any tag other than img, p or book.
+        """
+        if tag == 'img':
+            image = HTMLImage(attrs, self.cfg)
+            if not self._in_paragraph:
+                self.data.append(image)
+                return
+            if isinstance(self._paragraph_data, str):
+                # First image of this paragraph: switch from str to list,
+                # keeping the text seen so far. A paragraph that is already
+                # a list must not be wrapped again, or a second image would
+                # nest the list and produce a <p> inside a <p>.
+                text = self._paragraph_data
+                self._paragraph_data = [text] if text else []
+            self._paragraph_data.append(image)
+        elif tag == 'p':
+            self._in_paragraph = True
+        elif tag == 'book':
+            pass
+        else:
+            raise NotImplementedError(f'Unsupported tag: <{tag}>')
+
+    def handle_endtag(self, tag: str):
+        """Handle a closing tag; ``</p>`` stores the finished paragraph.
+
+        Raises:
+            NotImplementedError: for any tag other than img, p or book.
+        """
+        if tag in ('img', 'book'):
+            # <book> is accepted as an opening tag, so its closing tag must
+            # be accepted as well.
+            return
+        if tag != 'p':
+            raise NotImplementedError(f'Unsupported tag: </{tag}>')
+        self._in_paragraph = False
+        if self._paragraph_data:
+            self.data.append(self._paragraph_data)
+        self._paragraph_data = ''
+
+    def have_image(self, data_list=None) -> bool:
+        """Return True when ``data_list`` (default: ``self.data``) holds a
+        valid image, searching nested paragraph lists as well."""
+        if data_list is None:
+            data_list = self.data
+        for item in data_list:
+            if isinstance(item, HTMLImage):
+                if item.is_valid():
+                    return True
+            elif isinstance(item, list) and self.have_image(item):
+                return True
+        return False
+
+    def to_local(self, data_list=None) -> str:
+        """Render the parsed content as markup that references local images.
+
+        Called without arguments it renders ``self.data``, one ``<p>`` per
+        paragraph. It calls itself with a paragraph's list to render that
+        paragraph's inline content. Each successfully converted image is
+        appended to ``self.images``; an image whose conversion raises
+        ``ValueError`` is reported and left out.
+
+        Raises:
+            NotImplementedError: when an entry has an unexpected type.
+        """
+        # Local import: html.escape is not among the file's imports.
+        from html import escape
+
+        top_level = data_list is None
+        if top_level:
+            data_list = self.data
+        parts = []
+        for item in data_list:
+            if isinstance(item, str):
+                # The parser hands over text with character references
+                # already decoded, so &, < and > must be escaped again
+                # before being written back as markup.
+                text = escape(item, quote=False)
+                parts.append(f'<p>{text}</p>\n' if top_level else text)
+            elif isinstance(item, HTMLImage):
+                if item.is_valid():
+                    try:
+                        parts.append(item.to_local())
+                        self.images.append(item)
+                    except ValueError:
+                        print("the image is not valid.", item.src)
+            elif isinstance(item, list):
+                parts.append(f'<p>{self.to_local(item)}</p>\n')
+            else:
+                raise NotImplementedError()
+        if top_level and self._paragraph_data:
+            # A paragraph that was never closed: emit it once, at the top
+            # level only, rendering list content instead of its repr.
+            pending = self._paragraph_data
+            if isinstance(pending, list):
+                parts.append(f'<p>{self.to_local(pending)}</p>\n')
+            else:
+                parts.append(f'<p>{escape(pending, quote=False)}</p>\n')
+        return ''.join(parts)
+
+
+def get_extension(mime: str) -> str:
+    """Map an image MIME type to the file extension used for it.
+
+    Raises:
+        NotImplementedError: for any other MIME type; the offending value is
+            printed first.
+    """
+    known_extensions = {
+        'image/gif': '.gif',
+        'image/jpeg': '.jpeg',
+        'image/png': '.png',
+        'image/svg+xml': '.svg',
+        'image/webp': '.webp',  # EPUB 3.3 Draft
+    }
+    extension = known_extensions.get(mime)
+    if extension is None:
+        print(mime)
+        raise NotImplementedError()
+    return extension
+
+
+class EpubFile:
+    """Build one EPUB book step by step and write it out.
+
+    Usage order: ``set_book`` once, ``add_chapter`` for each chapter, then
+    ``save_epub_file``. ``add_chapter`` reads ``last_division_name``, which
+    is only created by ``set_book``.
+    """
+
+    def __init__(self, cfg: Config, out: str):
+        """Create an empty book.
+
+        Args:
+            cfg: configuration, handed to the image cache and content parser.
+            out: passed to ``EpubWriter`` as its first argument when saving.
+        """
+        self.epub = epub.EpubBook()
+        # Table-of-contents entries; becomes ``self.epub.toc`` on save.
+        self.EpubList = list()
+        self.epub.set_language('zh-CN')
+        self.cfg = cfg
+        self.out = out
+
+    def set_book(self, book):
+        """Set identifier, title, author and cover, and add an intro page.
+
+        ``book`` is indexed with the keys ``book_id``, ``book_name``,
+        ``author_name``, ``cover``, ``uptime`` and
+        ``last_chapter_info['chapter_title']``.
+        """
+        self.epub.set_identifier(str(book['book_id']))
+        self.epub.set_title(book['book_name'])
+        self.epub.add_author(book['author_name'])
+        cover_path = get_cache(self.cfg, book['cover'])
+        with open(cover_path, 'rb') as f:
+            cover = f.read()
+        if have_magic:
+            # Name the cover after its detected type.
+            mime_type = magic.from_buffer(cover, mime=True)
+            file_name = 'cover' + get_extension(mime_type)
+        else:
+            # Without python-magic the type is unknown; '.png' is assumed.
+            file_name = 'cover.png'
+        self.epub.set_cover(file_name, cover)
+        intro = epub.EpubHtml(title='book-detailed', file_name='intro.xhtml',
+                              lang='zh-CN')
+        # The values below are interpolated into the markup as-is.
+        intro.content = f'''<html><head></head><body>
+<h1>书名：{book['book_name']}</h1><p>ID：{book['book_id']}</p>
+<p>作者：{book['author_name']}</p><p>更新时间：{book['uptime']}</p>
+<p>最新章节：{book['last_chapter_info']['chapter_title']}</p></body></html>'''
+        self.epub.add_item(intro)
+        self.EpubList.append(intro)
+        self.epub.spine.append(intro)
+        # Division of the most recently added chapter; '' = none yet.
+        self.last_division_name = ''
+
+    def add_chapter(self, chapter, content: str, division_name: str):
+        """Add one chapter, its images, and its table-of-contents entry.
+
+        Args:
+            chapter: indexed with ``chapter_title`` and ``chapter_id``.
+            content: chapter text; every line becomes one ``<p>`` before it
+                is parsed by ``ContentParser``.
+            division_name: name of the division the chapter belongs to; a
+                new table-of-contents section starts whenever it changes.
+        """
+        chapter_title = chapter['chapter_title']
+        chapter_id = chapter['chapter_id']
+        ch = epub.EpubHtml(
+            title=chapter_title,
+            file_name=f'ch{chapter_id}.xhtml',
+            lang='zh-CN',
+            uid=f'ch{chapter_id}',
+        )
+        # Chapters of this division are marked as non-linear.
+        if division_name == '作品相关':
+            ch.is_linear = False
+        parser = ContentParser(self.cfg)
+        contents = content.splitlines()
+        try:
+            parser.feed('<p>' + '</p>\n<p>'.join(contents) + '</p>')
+        except Exception as e:
+            # Print the markup that failed to parse, then re-raise.
+            print('<p>' + '</p>\n<p>'.join(contents) + '</p>')
+            raise e
+        parser.close()
+        ch.content = f'<h1 style="text-align: center;">{chapter_title}</h1>\n{parser.to_local()}'  # noqa: E501
+        self.epub.add_item(ch)
+        # parser.images was filled by the to_local() call above.
+        count = 0
+        for oimg in parser.images:
+            with open(oimg.path, 'rb') as f:
+                img_content = f.read()
+            img = EpubImage()
+            img.file_name = oimg.epub_path
+            img.content = img_content
+            img.id = f'i{chapter_id}_{count}'
+            self.epub.add_item(img)
+            if oimg.epub_path.endswith('.webp'):
+                # WebP gets an explicit media type and, when the conversion
+                # works, a JPEG fallback item referenced through its id.
+                # NOTE(review): the media type of other images is not set
+                # here; presumably ebooklib derives it on add_item - confirm.
+                img.media_type = 'image/webp'
+                jpg_path = perform_convert(oimg.path)
+                if jpg_path is not None:
+                    jpg_img = epub.EpubImage()
+                    with open(jpg_path, 'rb') as f:
+                        jpg_content = f.read()
+                    jpg_img.file_name = os.path.basename(jpg_path)
+                    jpg_img.content = jpg_content
+                    jpg_img.id = img.id + 'f'
+                    img.fallback = jpg_img.id
+                    self.epub.add_item(jpg_img)
+            count += 1
+        # A change of division opens a new [Link, [chapters]] section whose
+        # link points at the first chapter of that division.
+        if self.last_division_name != division_name:
+            self.EpubList.append([epub.Link(ch.file_name, division_name), []])
+            self.last_division_name = division_name
+        if isinstance(self.EpubList[-1], list):
+            # Append to the chapter list of the current section.
+            self.EpubList[-1][-1].append(ch)
+        else:
+            # No section has been opened yet: list the chapter at top level.
+            self.EpubList.append(ch)
+        self.epub.spine.append(ch)
+
+    def save_epub_file(self):  # save epub file to local
+        """Attach the table of contents and write the book with EpubWriter.
+
+        An ``IOError`` raised while writing is printed, not propagated.
+        """
+        # the path to save epub file to local
+        self.epub.toc = self.EpubList
+        self.epub.add_item(epub.EpubNcx()), self.epub.add_item(epub.EpubNav())
+        book = EpubWriter(self.out, self.epub, {})
+        book.process()
+        try:
+            book.write()
+        except IOError:
+            print_exc()
--- a/export.py
+++ b/export.py
@@ -6,6 +6,7 @@ from booksnew import BooksNew
 from crypto import decrypt
 from os.path import dirname
 from os import makedirs
+from epub import EpubFile


 key_imported = False
@@ -56,6 +57,14 @@ def export_book(ncw: NovelCiwei, db: CwmDb, cfg: Config, bn: BooksNew,
        d = dirname(txt_filename)
        makedirs(d, exist_ok=True)
        txt = open(txt_filename, 'w', encoding='UTF-8')
+    if cfg.export_epub:
+        try:
+            epub = EpubFile(cfg, cfg.get_export_book(book, 'epub'))
+            epub.set_book(book)
+        except Exception as e:
+            if cfg.export_txt:
+                txt.close()
+            raise e
    try:
        chapters = ncw.get_chapter_with_bookid(book_id)
        divisions = ncw.get_divisions_with_bookid(book_id)
@@ -68,10 +77,13 @@ def export_book(ncw: NovelCiwei, db: CwmDb, cfg: Config, bn: BooksNew,
            else:
                maps[division_id] = [chapter]
        for division in divisions:
+            division_name = division['division_name']
            if cfg.export_txt:
-                txt.write(f"第{division['division_index']}卷 {division['division_name']}\n")  # noqa: E501
+                txt.write(f"第{division['division_index']}卷 {division_name}\n")
                if division['description']:
                    txt.write(division['description'] + '\n\n')
+            if cfg.export_epub and division['description']:
+                print('TODO: add division description to epub.')
            chapter_index = 1
            for chapter in maps[division['division_id']]:
                if chapter['is_download']:
@@ -82,6 +94,8 @@ def export_book(ncw: NovelCiwei, db: CwmDb, cfg: Config, bn: BooksNew,
                    if cfg.export_txt:
                        txt.write(f"第{chapter_index}章 {chapter_title}\n")
                        txt.write(content + '\n\n')
+                    if cfg.export_epub:
+                        epub.add_chapter(chapter, content, division_name)
                    count += 1
                else:
                    if cfg.export_txt:
@@ -91,3 +105,5 @@ def export_book(ncw: NovelCiwei, db: CwmDb, cfg: Config, bn: BooksNew,
    finally:
        if cfg.export_txt:
            txt.close()
+        if cfg.export_epub:
+            epub.save_epub_file()
--- a/image_cache.py
+++ b/image_cache.py
@@ -0,0 +1,34 @@
+from urllib.parse import urlparse
+from os.path import exists, join, dirname
+from os import makedirs
+from config import Config
+import requests
+
+
+def try_fetch(url):
+    """Download ``url`` and return the response body as bytes.
+
+    Up to five attempts are made. An attempt fails when ``requests.get``
+    raises or when the response status is not 200; the next attempt is then
+    started.
+
+    Raises:
+        ValueError: when every attempt failed. The message describes the
+            last failure (the HTTP status line or the exception raised).
+    """
+    last_error = 'Failed to fetch the image.'
+    for _ in range(5):
+        try:
+            resp = requests.get(url=url)
+        except Exception as e:
+            # Broad on purpose, as before: any failure leads to a retry.
+            last_error = f'Failed to fetch the image: {e}'
+            continue
+        if resp.status_code == 200:
+            return resp.content
+        last_error = f'HTTP ERROR {resp.status_code} {resp.reason}.'
+    # Raised only after the loop, so the retries actually happen and no
+    # response object is referenced when none was received.
+    raise ValueError(last_error)
+
+
+def get_cache(cfg: Config, url: str):
+    """Return the local path of the cached copy of ``url``.
+
+    The cache path is ``cfg.img_cache_dir`` joined with the path component
+    of the URL. The file is downloaded with ``try_fetch`` when it is not
+    cached yet.
+
+    Raises:
+        ValueError: when the URL has no usable path component, or when the
+            download fails.
+    """
+    # Local import: only makedirs is imported from os at file level.
+    from os import replace
+
+    # Drop empty, '.' and '..' segments: a trailing '/' is ignored as before,
+    # and '..' segments in a URL cannot climb out of the cache directory.
+    segments = [s for s in urlparse(url).path.split('/')
+                if s not in ('', '.', '..')]
+    if not segments:
+        # Without a file name the path would be the cache directory itself.
+        raise ValueError(f'No file name in image URL: {url}')
+    path = join(cfg.img_cache_dir, *segments)
+    if exists(path):
+        return path
+    img = try_fetch(url)
+    makedirs(dirname(path), exist_ok=True)
+    # Write to a temporary name and rename afterwards, so an interrupted
+    # write never leaves a truncated file that later counts as cached.
+    tmp_path = path + '.part'
+    with open(tmp_path, 'wb') as f:
+        f.write(img)
+    replace(tmp_path, path)
+    return path
--- a/main.py
+++ b/main.py
@@ -17,6 +17,7 @@ parser.add_argument('-r', '--real', help='Use default locations. Needed running
 parser.add_argument('-B', '--bid', '--book-id', help='The book id.', type=int)
 parser.add_argument('-t', '--type', help='Export type. Available types: epub, txt. Default: epub,txt')  # noqa: E501
 parser.add_argument('--ebt', '--export-book-template', help='The template of the exported book. Available key: <ext>, <book_id>, <book_name>, <author_name> eta.')  # noqa: E501
+parser.add_argument('--icd', '--image-cache-dir', help='Path to image cache directory.')  # noqa: E501
 parser.add_argument('action', help='The action to do.', choices=['importkey', 'exportchapter', 'exportbook'])  # noqa: E501


--- a/requirements.txt
+++ b/requirements.txt
@@ -1,2 +1,7 @@
 PyCryptodome
 semver
+ebooklib
+lxml
+python-magic
+python-magic-bin ; sys_platform == 'win32'
+requests