Add epub support
This commit is contained in:
1
.gitignore
vendored
1
.gitignore
vendored
@@ -162,3 +162,4 @@ cython_debug/
|
||||
*.json
|
||||
*.db
|
||||
exported/
|
||||
img_cache/
|
||||
|
||||
10
README.md
Normal file
10
README.md
Normal file
@@ -0,0 +1,10 @@
|
||||
# Setup
|
||||
## Termux
|
||||
```shell
|
||||
pkg install git libxml2 libxslt python ffmpeg
|
||||
git clone https://github.com/lifegpc/cwm_export
|
||||
cd cwm_export
|
||||
python -m venv venv
|
||||
source venv/bin/activate
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
@@ -35,6 +35,8 @@ class Config:
|
||||
if key in self._data:
|
||||
return self._data[key]
|
||||
else:
|
||||
if default is not None:
|
||||
self._data[key] = default
|
||||
return default
|
||||
|
||||
@cached_property
|
||||
@@ -90,6 +92,10 @@ class Config:
|
||||
temp = temp.replace(f'<{k}>', str(chapter[k]))
|
||||
return temp
|
||||
|
||||
@cached_property
|
||||
def img_cache_dir(self):
|
||||
return self.get_arg('img_cache_dir', 'img_cache')
|
||||
|
||||
@cached_property
|
||||
def key(self):
|
||||
return self.get_arg('key', None)
|
||||
|
||||
359
epub.py
Normal file
359
epub.py
Normal file
@@ -0,0 +1,359 @@
|
||||
from ebooklib import epub, ITEM_IMAGE
|
||||
from lxml import etree
|
||||
import subprocess
|
||||
from typing import Optional
|
||||
import os
|
||||
from config import Config
|
||||
from image_cache import get_cache
|
||||
from traceback import print_exc
|
||||
import xml.etree.ElementTree as ET
|
||||
from html.parser import HTMLParser
|
||||
try:
|
||||
import magic
|
||||
have_magic = True
|
||||
except ImportError:
|
||||
have_magic = False
|
||||
print('Warning: python-magic not found. The mimetype in EPUB file may wrong.') # noqa: E501
|
||||
import platform
|
||||
if platform.system() == "Windows":
|
||||
print('python-magic-bin is also needed on Windows.')
|
||||
|
||||
|
||||
# Add fallback property to ebooklib
|
||||
# ebooklib does not support fallback
|
||||
class EpubItem(epub.EpubItem):
|
||||
def __init__(self, uid=None, file_name='', media_type='',
|
||||
content=epub.six.b(''), manifest=True):
|
||||
super().__init__(uid, file_name, media_type, content, manifest)
|
||||
self.fallback = None
|
||||
|
||||
|
||||
class EpubImage(EpubItem):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
|
||||
def get_type(self):
|
||||
return ITEM_IMAGE
|
||||
|
||||
def __str__(self):
|
||||
return '<EpubImage:%s:%s>' % (self.id, self.file_name)
|
||||
|
||||
|
||||
class EpubWriter(epub.EpubWriter):
|
||||
def _write_opf_manifest(self, root):
|
||||
manifest = epub.etree.SubElement(root, 'manifest')
|
||||
_ncx_id = None
|
||||
|
||||
for item in self.book.get_items():
|
||||
if not item.manifest:
|
||||
continue
|
||||
|
||||
if isinstance(item, epub.EpubNav):
|
||||
etree.SubElement(manifest, 'item', {'href': item.get_name(),
|
||||
'id': item.id,
|
||||
'media-type': item.media_type, # noqa: E501
|
||||
'properties': 'nav'})
|
||||
elif isinstance(item, epub.EpubNcx):
|
||||
_ncx_id = item.id
|
||||
etree.SubElement(manifest, 'item', {'href': item.file_name,
|
||||
'id': item.id,
|
||||
'media-type': item.media_type}) # noqa: E501
|
||||
|
||||
elif isinstance(item, epub.EpubCover):
|
||||
etree.SubElement(manifest, 'item', {'href': item.file_name,
|
||||
'id': item.id,
|
||||
'media-type': item.media_type, # noqa: E501
|
||||
'properties': 'cover-image'}) # noqa: E501
|
||||
else:
|
||||
opts = {'href': item.file_name,
|
||||
'id': item.id,
|
||||
'media-type': item.media_type}
|
||||
|
||||
if hasattr(item, 'properties') and len(item.properties) > 0:
|
||||
opts['properties'] = ' '.join(item.properties)
|
||||
|
||||
if hasattr(item, 'media_overlay') and item.media_overlay is not None: # noqa: E501
|
||||
opts['media-overlay'] = item.media_overlay
|
||||
|
||||
if hasattr(item, 'media_duration') and item.media_duration is not None: # noqa: E501
|
||||
opts['duration'] = item.media_duration
|
||||
|
||||
if hasattr(item, 'fallback') and item.fallback is not None:
|
||||
opts['fallback'] = item.fallback
|
||||
|
||||
etree.SubElement(manifest, 'item', opts)
|
||||
|
||||
return _ncx_id
|
||||
|
||||
|
||||
have_ffmpeg = None
|
||||
|
||||
|
||||
def check_ffmpeg():
|
||||
p = subprocess.Popen(['ffmpeg', '-h'], stdout=subprocess.DEVNULL,
|
||||
stderr=subprocess.DEVNULL)
|
||||
p.communicate()
|
||||
global have_ffmpeg
|
||||
have_ffmpeg = not p.wait()
|
||||
if not have_ffmpeg:
|
||||
print('Warning: Can not find ffmpeg. Some epub readers may failed to open these pictures.') # noqa: E501
|
||||
|
||||
|
||||
def perform_convert(image_path: str) -> Optional[str]:
|
||||
output_path = os.path.splitext(image_path)[0] + '_fallback.jpg'
|
||||
if os.path.exists(output_path) and os.path.getsize(output_path) > 4096:
|
||||
return output_path
|
||||
if have_ffmpeg is None:
|
||||
check_ffmpeg()
|
||||
if have_ffmpeg:
|
||||
p = subprocess.Popen(['ffmpeg', '-y', '-i', image_path, output_path],
|
||||
stdout=subprocess.DEVNULL,
|
||||
stderr=subprocess.DEVNULL)
|
||||
p.communicate()
|
||||
code = p.wait()
|
||||
if not code:
|
||||
return output_path
|
||||
else:
|
||||
print(f'Warning: Can not convert images by using ffmpeg. (Exit code: {code}) Some epub readers may failed to open these pictures.') # noqa: E501
|
||||
return None
|
||||
else:
|
||||
return None
|
||||
|
||||
|
||||
class HTMLImage:
|
||||
def __init__(self, attrs, cfg: Config):
|
||||
self.src = None
|
||||
self.alt = None
|
||||
self.path = None
|
||||
self.epub_path = None
|
||||
for key, value in attrs:
|
||||
if key == 'src':
|
||||
self.src = value
|
||||
elif key == 'alt':
|
||||
self.alt = value
|
||||
self.cfg = cfg
|
||||
|
||||
def is_valid(self):
|
||||
return self.src is not None
|
||||
|
||||
def download_image(self):
|
||||
if not self.is_valid():
|
||||
return False
|
||||
try:
|
||||
self.path = get_cache(self.cfg, self.src)
|
||||
return True
|
||||
except Exception:
|
||||
print_exc()
|
||||
|
||||
def to_local(self):
|
||||
if not self.is_valid():
|
||||
return ""
|
||||
if not self.download_image():
|
||||
raise ValueError("Failed to download image.")
|
||||
self.epub_path = os.path.basename(self.path)
|
||||
if have_magic:
|
||||
with open(self.path, 'rb') as f:
|
||||
mime = magic.from_buffer(f.read(4096), True)
|
||||
self.epub_path = os.path.splitext(self.epub_path)[0] + get_extension(mime) # noqa: E501
|
||||
d = {'src': self.epub_path}
|
||||
if self.alt:
|
||||
d['alt'] = self.alt
|
||||
return ET.tostring(ET.Element('img', d), 'unicode')
|
||||
|
||||
|
||||
# Used to parse content
|
||||
class ContentParser(HTMLParser):
|
||||
def __init__(self, cfg: Config):
|
||||
super().__init__()
|
||||
self._in_paragraph = False
|
||||
self.data = []
|
||||
# Local image file lists
|
||||
self.images = []
|
||||
self._paragraph_data = ''
|
||||
self.cfg = cfg
|
||||
|
||||
def handle_data(self, data: str):
|
||||
if self._in_paragraph:
|
||||
if isinstance(self._paragraph_data, str):
|
||||
self._paragraph_data += data
|
||||
elif isinstance(self._paragraph_data, list):
|
||||
self._paragraph_data.append(data)
|
||||
|
||||
def handle_starttag(self, tag, attrs):
|
||||
if tag == 'img':
|
||||
if self._in_paragraph:
|
||||
if self._paragraph_data:
|
||||
self._paragraph_data = [self._paragraph_data]
|
||||
else:
|
||||
self._paragraph_data = []
|
||||
self._paragraph_data.append(HTMLImage(attrs, self.cfg))
|
||||
else:
|
||||
self.data.append(HTMLImage(attrs, self.cfg))
|
||||
elif tag == 'p':
|
||||
self._in_paragraph = True
|
||||
elif tag == 'book':
|
||||
pass
|
||||
else:
|
||||
raise NotImplementedError()
|
||||
|
||||
def handle_endtag(self, tag: str):
|
||||
if tag == 'img':
|
||||
pass
|
||||
elif tag == 'p':
|
||||
self._in_paragraph = False
|
||||
if self._paragraph_data:
|
||||
self.data.append(self._paragraph_data)
|
||||
self._paragraph_data = ''
|
||||
else:
|
||||
raise NotImplementedError()
|
||||
|
||||
def have_image(self, data_list=None) -> bool:
|
||||
if data_list is None:
|
||||
data_list = self.data
|
||||
for i in data_list:
|
||||
if isinstance(i, HTMLImage):
|
||||
if i.is_valid():
|
||||
return True
|
||||
elif isinstance(i, list):
|
||||
if self.have_image(i):
|
||||
return True
|
||||
return False
|
||||
|
||||
def to_local(self, data_list=None) -> str:
|
||||
default_data_list = False
|
||||
if data_list is None:
|
||||
data_list = self.data
|
||||
default_data_list = True
|
||||
data = ''
|
||||
for i in data_list:
|
||||
if isinstance(i, str):
|
||||
if default_data_list:
|
||||
data += f'<p>{i}</p>\n'
|
||||
else:
|
||||
data += i
|
||||
elif isinstance(i, HTMLImage):
|
||||
if i.is_valid():
|
||||
try:
|
||||
data += i.to_local()
|
||||
self.images.append(i)
|
||||
except ValueError:
|
||||
print("the image is not valid.", i.src)
|
||||
elif isinstance(i, list):
|
||||
data += f'<p>{self.to_local(i)}</p>\n'
|
||||
else:
|
||||
raise NotImplementedError()
|
||||
if self._paragraph_data:
|
||||
data += f'<p>{self._paragraph_data}</p>\n'
|
||||
return data
|
||||
|
||||
|
||||
def get_extension(mime: str) -> str:
|
||||
if mime == 'image/gif':
|
||||
return '.gif'
|
||||
elif mime == 'image/jpeg':
|
||||
return '.jpeg'
|
||||
elif mime == 'image/png':
|
||||
return '.png'
|
||||
elif mime == 'image/svg+xml':
|
||||
return '.svg'
|
||||
elif mime == 'image/webp': # EPUB 3.3 Draft
|
||||
return '.webp'
|
||||
else:
|
||||
print(mime)
|
||||
raise NotImplementedError()
|
||||
|
||||
|
||||
class EpubFile:
|
||||
def __init__(self, cfg: Config, out: str):
|
||||
self.epub = epub.EpubBook()
|
||||
self.EpubList = list()
|
||||
self.epub.set_language('zh-CN')
|
||||
self.cfg = cfg
|
||||
self.out = out
|
||||
|
||||
def set_book(self, book):
|
||||
self.epub.set_identifier(str(book['book_id']))
|
||||
self.epub.set_title(book['book_name'])
|
||||
self.epub.add_author(book['author_name'])
|
||||
cover_path = get_cache(self.cfg, book['cover'])
|
||||
with open(cover_path, 'rb') as f:
|
||||
cover = f.read()
|
||||
if have_magic:
|
||||
mime_type = magic.from_buffer(cover, mime=True)
|
||||
file_name = 'cover' + get_extension(mime_type)
|
||||
else:
|
||||
file_name = 'cover.png'
|
||||
self.epub.set_cover(file_name, cover)
|
||||
intro = epub.EpubHtml(title='book-detailed', file_name='intro.xhtml',
|
||||
lang='zh-CN')
|
||||
intro.content = f'''<html><head></head><body>
|
||||
<h1>书名:{book['book_name']}</h1><p>ID:{book['book_id']}</p>
|
||||
<p>作者:{book['author_name']}</p><p>更新时间:{book['uptime']}</p>
|
||||
<p>最新章节:{book['last_chapter_info']['chapter_title']}</p></body></html>'''
|
||||
self.epub.add_item(intro)
|
||||
self.EpubList.append(intro)
|
||||
self.epub.spine.append(intro)
|
||||
self.last_division_name = ''
|
||||
|
||||
def add_chapter(self, chapter, content: str, division_name: str):
|
||||
chapter_title = chapter['chapter_title']
|
||||
chapter_id = chapter['chapter_id']
|
||||
ch = epub.EpubHtml(
|
||||
title=chapter_title,
|
||||
file_name=f'ch{chapter_id}.xhtml',
|
||||
lang='zh-CN',
|
||||
uid=f'ch{chapter_id}',
|
||||
)
|
||||
if division_name == '作品相关':
|
||||
ch.is_linear = False
|
||||
parser = ContentParser(self.cfg)
|
||||
contents = content.splitlines()
|
||||
try:
|
||||
parser.feed('<p>' + '</p>\n<p>'.join(contents) + '</p>')
|
||||
except Exception as e:
|
||||
print('<p>' + '</p>\n<p>'.join(contents) + '</p>')
|
||||
raise e
|
||||
parser.close()
|
||||
ch.content = f'<h1 style="text-align: center;">{chapter_title}</h1>\n{parser.to_local()}' # noqa: E501
|
||||
self.epub.add_item(ch)
|
||||
count = 0
|
||||
for oimg in parser.images:
|
||||
with open(oimg.path, 'rb') as f:
|
||||
img_content = f.read()
|
||||
img = EpubImage()
|
||||
img.file_name = oimg.epub_path
|
||||
img.content = img_content
|
||||
img.id = f'i{chapter_id}_{count}'
|
||||
self.epub.add_item(img)
|
||||
if oimg.epub_path.endswith('.webp'):
|
||||
img.media_type = 'image/webp'
|
||||
jpg_path = perform_convert(oimg.path)
|
||||
if jpg_path is not None:
|
||||
jpg_img = epub.EpubImage()
|
||||
with open(jpg_path, 'rb') as f:
|
||||
jpg_content = f.read()
|
||||
jpg_img.file_name = os.path.basename(jpg_path)
|
||||
jpg_img.content = jpg_content
|
||||
jpg_img.id = img.id + 'f'
|
||||
img.fallback = jpg_img.id
|
||||
self.epub.add_item(jpg_img)
|
||||
count += 1
|
||||
if self.last_division_name != division_name:
|
||||
self.EpubList.append([epub.Link(ch.file_name, division_name), []])
|
||||
self.last_division_name = division_name
|
||||
if isinstance(self.EpubList[-1], list):
|
||||
self.EpubList[-1][-1].append(ch)
|
||||
else:
|
||||
self.EpubList.append(ch)
|
||||
self.epub.spine.append(ch)
|
||||
|
||||
def save_epub_file(self): # save epub file to local
|
||||
# the path to save epub file to local
|
||||
self.epub.toc = self.EpubList
|
||||
self.epub.add_item(epub.EpubNcx()), self.epub.add_item(epub.EpubNav())
|
||||
book = EpubWriter(self.out, self.epub, {})
|
||||
book.process()
|
||||
try:
|
||||
book.write()
|
||||
except IOError:
|
||||
print_exc()
|
||||
18
export.py
18
export.py
@@ -6,6 +6,7 @@ from booksnew import BooksNew
|
||||
from crypto import decrypt
|
||||
from os.path import dirname
|
||||
from os import makedirs
|
||||
from epub import EpubFile
|
||||
|
||||
|
||||
key_imported = False
|
||||
@@ -56,6 +57,14 @@ def export_book(ncw: NovelCiwei, db: CwmDb, cfg: Config, bn: BooksNew,
|
||||
d = dirname(txt_filename)
|
||||
makedirs(d, exist_ok=True)
|
||||
txt = open(txt_filename, 'w', encoding='UTF-8')
|
||||
if cfg.export_epub:
|
||||
try:
|
||||
epub = EpubFile(cfg, cfg.get_export_book(book, 'epub'))
|
||||
epub.set_book(book)
|
||||
except Exception as e:
|
||||
if cfg.export_txt:
|
||||
txt.close()
|
||||
raise e
|
||||
try:
|
||||
chapters = ncw.get_chapter_with_bookid(book_id)
|
||||
divisions = ncw.get_divisions_with_bookid(book_id)
|
||||
@@ -68,10 +77,13 @@ def export_book(ncw: NovelCiwei, db: CwmDb, cfg: Config, bn: BooksNew,
|
||||
else:
|
||||
maps[division_id] = [chapter]
|
||||
for division in divisions:
|
||||
division_name = division['division_name']
|
||||
if cfg.export_txt:
|
||||
txt.write(f"第{division['division_index']}卷 {division['division_name']}\n") # noqa: E501
|
||||
txt.write(f"第{division['division_index']}卷 {division_name}\n")
|
||||
if division['description']:
|
||||
txt.write(division['description'] + '\n\n')
|
||||
if cfg.export_epub and division['description']:
|
||||
print('TODO: add division description to epub.')
|
||||
chapter_index = 1
|
||||
for chapter in maps[division['division_id']]:
|
||||
if chapter['is_download']:
|
||||
@@ -82,6 +94,8 @@ def export_book(ncw: NovelCiwei, db: CwmDb, cfg: Config, bn: BooksNew,
|
||||
if cfg.export_txt:
|
||||
txt.write(f"第{chapter_index}章 {chapter_title}\n")
|
||||
txt.write(content + '\n\n')
|
||||
if cfg.export_epub:
|
||||
epub.add_chapter(chapter, content, division_name)
|
||||
count += 1
|
||||
else:
|
||||
if cfg.export_txt:
|
||||
@@ -91,3 +105,5 @@ def export_book(ncw: NovelCiwei, db: CwmDb, cfg: Config, bn: BooksNew,
|
||||
finally:
|
||||
if cfg.export_txt:
|
||||
txt.close()
|
||||
if cfg.export_epub:
|
||||
epub.save_epub_file()
|
||||
|
||||
34
image_cache.py
Normal file
34
image_cache.py
Normal file
@@ -0,0 +1,34 @@
|
||||
from urllib.parse import urlparse
|
||||
from os.path import exists, join, dirname
|
||||
from os import makedirs
|
||||
from config import Config
|
||||
import requests
|
||||
|
||||
|
||||
def try_fetch(url):
|
||||
for _ in range(5):
|
||||
try:
|
||||
re = requests.get(url=url)
|
||||
if re.status_code == 200:
|
||||
return re.content
|
||||
except Exception:
|
||||
pass
|
||||
raise ValueError(f'HTTP ERROR {re.status_code} {re.reason}.')
|
||||
raise ValueError('Failed to fetch the image.')
|
||||
|
||||
|
||||
def get_cache(cfg: Config, url: str):
|
||||
u = urlparse(url)
|
||||
path = u.path
|
||||
if path.endswith('/'):
|
||||
path = path[:-1]
|
||||
path = join(cfg.img_cache_dir, path[1:])
|
||||
if exists(path):
|
||||
return path
|
||||
else:
|
||||
img = try_fetch(url)
|
||||
d = dirname(path)
|
||||
makedirs(d, exist_ok=True)
|
||||
with open(path, 'wb') as f:
|
||||
f.write(img)
|
||||
return path
|
||||
1
main.py
1
main.py
@@ -17,6 +17,7 @@ parser.add_argument('-r', '--real', help='Use default locations. Needed running
|
||||
parser.add_argument('-B', '--bid', '--book-id', help='The book id.', type=int)
|
||||
parser.add_argument('-t', '--type', help='Export type. Available types: epub, txt. Default: epub,txt') # noqa: E501
|
||||
parser.add_argument('--ebt', '--export-book-template', help='The template of the exported book. Available key: <ext>, <book_id>, <book_name>, <author_name> eta.') # noqa: E501
|
||||
parser.add_argument('--icd', '--image-cache-dir', help='Path to image cache directory.') # noqa: E501
|
||||
parser.add_argument('action', help='The action to do.', choices=['importkey', 'exportchapter', 'exportbook']) # noqa: E501
|
||||
|
||||
|
||||
|
||||
@@ -1,2 +1,7 @@
|
||||
PyCryptodome
|
||||
semver
|
||||
ebooklib
|
||||
lxml
|
||||
python-magic
|
||||
python-magic-bin ; sys_platform == 'win32'
|
||||
requests
|
||||
|
||||
Reference in New Issue
Block a user