Files
pythonscript/pixiv_json_to_html.py
2026-04-22 12:41:11 +08:00

126 lines
5.7 KiB
Python

from os.path import join, isfile, basename, abspath
from json import load
from html import escape
def pixiv_json_to_html(dir_path):
json_path = join(dir_path, 'data.json')
with open(json_path, 'r', encoding='utf-8') as f:
data = load(f)
raw = data['raw']
title = raw['title']
html_data = f'<!DOCTYPE html><html><head><meta charset="utf-8"><title>{escape(title)}</title></head><body>\n'
html_data += f'<h1>{escape(title)}</h1>\n'
typ = raw['type']
img_idx = 1
used = set()
pbase = basename(abspath(dir_path))
if typ == 'article':
blocks = raw['body']['blocks']
imageMap = raw['body']['imageMap']
fileMap = raw['body']['fileMap']
urlEmbedMap = raw['body']['urlEmbedMap']
for block in blocks:
block_type = block['type']
if block_type == 'p':
text = block['text']
styles = []
if 'styles' in block:
styles = block['styles']
events = []
for style in styles:
if style['type'] == 'bold':
start = style['offset']
end = style['offset'] + style['length']
events.append((start, '<b>', False))
events.append((end, '</b>', True))
else:
raise ValueError(f'Unsupported style type: {style["type"]}')
events.sort(key=lambda x: (x[0], not x[2]))
output = ""
last_idx = 0
for pos, tag, is_closing in events:
output += escape(text[last_idx:pos])
output += tag
last_idx = pos
output += escape(text[last_idx:])
html_data += f'<p>{output}</p>\n'
elif block_type == 'header':
for key in block:
if key not in ['type', 'text']:
print(block)
raise ValueError(f'Unsupported header block key: {key}')
text = block['text']
html_data += f'<h2>{escape(text)}</h2>\n'
elif block_type == 'image':
image_id = block['imageId']
image_url = imageMap[image_id]['originalUrl']
image_ext = imageMap[image_id]['extension']
image_path = join(dir_path, f'{pbase}_{img_idx}.{image_ext}')
if not isfile(image_path):
print(image_path)
image_path = join(dir_path, f'{image_id}.{image_ext}')
image_path = basename(image_path)
html_data += f'<p><img src="{escape(image_path)}" data-original-url="{escape(image_url)}"></p>\n'
if image_id not in used:
used.add(image_id)
img_idx += 1
elif block_type == 'url_embed':
urlEmbedId = block['urlEmbedId']
embedData = urlEmbedMap[urlEmbedId]
embedType = embedData['type']
if embedType == 'fanbox.post':
postInfo = embedData['postInfo']
postTitle = postInfo['title']
postUrl = f'https://www.fanbox.cc/{postInfo['creatorId']}/posts/{postInfo['id']}'
postCover = None
if 'cover' in postInfo and postInfo['cover']['type'] == 'cover_image':
postCover = postInfo['cover']['url']
postExcerpt = postInfo['excerpt']
html_data += f'<div class="url-embed"><h2><a href="{escape(postUrl)}">{escape(postTitle)}</a></h2>'
if postCover:
html_data += f'<p><img src="{escape(postCover)}"></p>'
html_data += f'<p>{escape(postExcerpt)}</p></div>\n'
elif embedType == 'html.card':
embed_html_data = embedData['html']
html_data += f'<div class="url-embed">{embed_html_data}</div>\n'
elif embedType == 'html':
embed_html_data = embedData['html']
html_data += f'<div class="url-embed">{embed_html_data}</div>\n'
elif embedType == 'default':
url = embedData['url']
html_data += f'<div class="url-embed"><iframe src="{escape(url)}" /></div>\n'
else:
raise ValueError(f'Unsupported embed type: {embedType}')
else:
print(block)
raise ValueError(f'Unsupported block type: {block_type}')
elif typ == 'image':
text = raw['body']['text']
images = raw['body']['images']
html_data += f'<p>{escape(text).replace('\n', '<br>')}</p>\n'
for image in images:
image_id = image['id']
image_url = image['originalUrl']
image_ext = image['extension']
image_path = join(dir_path, f'{pbase}_{img_idx}.{image_ext}')
if not isfile(image_path):
print(image_path)
image_path = join(dir_path, f'{image_id}.{image_ext}')
image_path = basename(image_path)
html_data += f'<p><img src="{escape(image_path)}" data-original-url="{escape(image_url)}"></p>\n'
img_idx += 1
else:
raise ValueError(f'Unsupported body type: {typ}')
html_data += '</body></html>\n'
with open(join(dir_path, 'output.html'), 'w', encoding='utf-8') as f:
f.write(html_data)
if __name__ == '__main__':
import argparse
parser = argparse.ArgumentParser(description='Convert Pixiv JSON to HTML')
parser.add_argument('dir', help='Directory containing data.json')
args = parser.parse_args()
pixiv_json_to_html(args.dir)