# Files
# GalScripts/m3tpostprocess.py
# 2026-01-05 10:23:54 +08:00
#
# 208 lines
# 7.3 KiB
# Python

import re
import os
import math
from enum import Enum
class ScriptType(str, Enum):
    """Kind of script whose lines are being post-processed.

    Members are also ``str`` instances whose value is the lowercase name
    used on the command line.
    """

    GENERIC = "generic"
    SCN = "scn"
    XML = "xml"

    def __str__(self):
        """Return the member's plain string value, e.g. ``"scn"``."""
        return self.value

    @staticmethod
    def from_string(s: str):
        """Return the member whose value equals ``s`` lowercased.

        Raises:
            ValueError: if no member has that value.
        """
        wanted = s.lower()
        found = next(
            (member for member in ScriptType if member.value == wanted),
            None,
        )
        if found is None:
            raise ValueError(f"Unknown ScriptType: {s}")
        return found

    def is_scn(self):
        """Return True when this member is ``SCN``."""
        return ScriptType.SCN == self

    def is_xml(self):
        """Return True when this member is ``XML``."""
        return ScriptType.XML == self
# NOTE(review): several string literals in this section are empty ('' / "")
# in this copy of the file: every replacement in HALF_FULL_MAP except the
# last, two of its search keys, and all entries of the four quote tables.
# They are reproduced unchanged here. Presumably non-ASCII characters were
# lost somewhere along the way -- confirm against the upstream file before
# relying on these tables.

# Three or more consecutive ASCII dots.
ELLIPSIS = re.compile(r'\.{3,}')
# One or more horizontal-ellipsis characters.
ELLIPSIS_ZH = re.compile(r'…+')
# One or more em dashes.
HYPHEN_ZH = re.compile(r'—+')
# (search, replacement) pairs for str.replace(), applied in this order.
# As written, each listed ASCII character is replaced by the empty string.
# The name suggests full-width counterparts were intended -- TODO confirm.
HALF_FULL_MAP = (
    (',', ''),
    ('.', ''),
    ('!', ''),
    ('?', ''),
    ('~', ''),
    ('(', ''),
    (')', ''),
    (':', ''),
    (';', ''),
    ('-', ''),
    ('', ''),
    ('&', ''),
    ('%', ''),
    ('', '·'),
)
# Comma/period/enumeration-comma run followed by a run of "…", "—" or "~";
# group 1 is the trailing run.
MIXED_RULE1 = re.compile(r'[,。、]+([…—~]+)')
# The same two runs in the opposite order; group 1 is the leading run.
MIXED_RULE2 = re.compile(r'([…—~]+)[,。、]+')
# Two or more characters from "…—~" in a row; group 1 is the first one.
MIXED_RULE3 = re.compile(r'([…—~])[…—~]+')
# Comma/period/enumeration-comma run directly before a ")" that ends the
# string. The ")" is matched by a lookahead, so substituting '' removes only
# the punctuation. The previous pattern, r'([,。、]+))$', had an unbalanced
# parenthesis and raised re.error as soon as the module was imported.
# NOTE(review): a full-width closing parenthesis may have been intended.
MIXED_RULE4 = re.compile(r'([,。、]+)(?=\)$)')
# Hiragana: runs of characters in U+3040..U+309F.
HIRAGANA = re.compile(r'[\u3040-\u309F]+')
# Katakana: runs of characters in U+30A0..U+30FF.
KATAKANA = re.compile(r'[\u30A0-\u30FF]+')
# Inline commands found in scn text: "%" followed by either text up to the
# next ";" or a bare "r", or "#" plus 6-8 hex digits and ";". The capturing
# group makes re.split() keep the commands in its result.
SCN_SPECIAL = re.compile(r'(%(?:[^%;]*;|r)|#[0-9a-fA-F]{6,8};)')
# Text between the first and last ASCII double quote (greedy).
MUL_QUOTE_RULE = re.compile(r'"(.*)"')
# Text between the first and last ASCII single quote (greedy).
SINGLE_QUOTE_RULE = re.compile(r"'(.*)'")
# A "<...>" tag; the capturing group makes re.split() keep the tags.
XML_TAG_SPECIAL = re.compile(r'(<[^>]+>)')
# Quote tables used by replace_quote_str(); index 0/1 is chosen by nesting
# depth modulo 2.
CJK_LEFT_QUOTE = ["", ""]
CJK_RIGHT_QUOTE = ["", ""]
NEED_REPLACED_LEFT_QUOTE = ["", ""]
NEED_REPLACED_RIGHT_QUOTE = ["", ""]
def process_line(data: str) -> str:
    """Normalize the punctuation of one piece of text.

    The steps run in a fixed order: dot runs become ellipsis characters,
    HALF_FULL_MAP replacements are applied, mixed punctuation runs are
    simplified, ellipsis and dash runs are collapsed to exactly two
    characters, hiragana and katakana are removed, and ASCII quote pairs
    are rewritten.

    Args:
        data: Text to normalize.

    Returns:
        The normalized text.
    """
    # Replace each run of dots with ellipsis characters, one per three dots
    # (rounded up). The replacement literal used to be '', which deleted the
    # dots instead of converting them as this step is meant to.
    data = ELLIPSIS.sub(lambda m: '…' * math.ceil(len(m.group(0)) / 3), data)
    for half, full in HALF_FULL_MAP:
        if not half:
            # str.replace('', x) inserts x before every character and at the
            # end of the string, so an empty search key must never be applied.
            continue
        data = data.replace(half, full)
    # Comma/period/enumeration comma next to "…", "—" or "~": keep only the
    # latter, whichever side the punctuation is on.
    data = MIXED_RULE1.sub(lambda m: m.group(1), data)
    data = MIXED_RULE2.sub(lambda m: m.group(1), data)
    # A run mixing "…", "—" and "~" becomes its first character repeated to
    # the same length.
    data = MIXED_RULE3.sub(lambda m: m.group(1) * len(m.group(0)), data)
    # Any run of ellipses or em dashes is reduced to exactly one pair.
    data = ELLIPSIS_ZH.sub('……', data)
    data = HYPHEN_ZH.sub('——', data)
    # Remove all hiragana.
    data = HIRAGANA.sub('', data)
    # Remove all katakana.
    data = KATAKANA.sub('', data)
    data = data.replace('!?', '?!')
    # Drop a period directly before a closing corner bracket. The replacement
    # used to be '', which removed the bracket as well.
    data = data.replace('。」', '」')
    # Remove redundant punctuation before a closing parenthesis.
    # NOTE(review): everything MIXED_RULE4 consumes is deleted here, so the
    # pattern must not consume the parenthesis itself -- verify.
    data = MIXED_RULE4.sub('', data)
    # NOTE(review): the strings wrapped around group 1 below are empty in this
    # copy, so ASCII quote pairs are simply removed; presumably typographic
    # quotes were intended -- confirm against the upstream file.
    while MUL_QUOTE_RULE.search(data):
        data = MUL_QUOTE_RULE.sub(lambda m: '' + m.group(1) + '', data)
    while SINGLE_QUOTE_RULE.search(data):
        data = SINGLE_QUOTE_RULE.sub(lambda m: '' + m.group(1) + '', data)
    return data
def process_scn_line(data: str) -> str:
    """Run process_line() on the text between scn inline commands.

    The line is split on SCN_SPECIAL; the commands themselves are kept
    verbatim and only the text segments around them are normalized.

    Args:
        data: One line of scn text.

    Returns:
        The line with its text segments normalized and commands untouched.
    """
    parts = SCN_SPECIAL.split(data)
    # Splitting on a pattern with one capturing group alternates text (even
    # indexes) and commands (odd indexes), so only even indexes are visited.
    for i in range(0, len(parts), 2):
        parts[i] = process_line(parts[i])
        # Special case "。%command;」": a command sits between the period and
        # the closing bracket, so process_line() cannot see them as a pair.
        # Strip the trailing periods from the previous text segment instead.
        # Both literals used to be '', which made startswith()/endswith()
        # always true and the stripping loop endless once the previous
        # segment was empty.
        if i > 0 and parts[i].startswith('」'):
            parts[i - 2] = parts[i - 2].rstrip('。')
    return ''.join(parts)
def process_xml_line(data: str) -> str:
    """Run process_line() on the text between XML-style tags.

    The line is split on XML_TAG_SPECIAL; the tags are kept verbatim and
    only the text segments around them are normalized.

    Args:
        data: One line of text that may contain "<...>" tags.

    Returns:
        The line with its text segments normalized and tags untouched.
    """
    parts = XML_TAG_SPECIAL.split(data)
    # Splitting on a pattern with one capturing group alternates text (even
    # indexes) and tags (odd indexes), so only even indexes are visited.
    for i in range(0, len(parts), 2):
        parts[i] = process_line(parts[i])
        # Same special case as process_scn_line(): when a tag separates a
        # period from a following closing corner bracket, strip the trailing
        # periods from the previous text segment. Both literals used to be
        # '', which made the tests always true and the stripping loop endless
        # once the previous segment was empty.
        if i > 0 and parts[i].startswith('」'):
            parts[i - 2] = parts[i - 2].rstrip('。')
    return ''.join(parts)
def replace_quote_str(s) -> str:
    """Rewrite quote characters so that nested levels alternate.

    Each character found in one of the four quote tables is replaced by an
    entry of CJK_LEFT_QUOTE or CJK_RIGHT_QUOTE, picked by the current
    nesting depth modulo 2. All other characters are copied unchanged.

    NOTE(review): in this copy every table entry is the empty string, which
    no single character equals, so the input comes back unchanged.
    """
    # Tables are tested in this order; the flag says whether a hit opens
    # (True) or closes (False) a quotation.
    rules = (
        (CJK_LEFT_QUOTE, True),
        (CJK_RIGHT_QUOTE, False),
        (NEED_REPLACED_LEFT_QUOTE, True),
        (NEED_REPLACED_RIGHT_QUOTE, False),
    )
    nesting = 0
    pieces = []
    for ch in s:
        opens = next((flag for table, flag in rules if ch in table), None)
        if opens is None:
            pieces.append(ch)
        elif opens:
            # The opening quote uses the depth before entering the level.
            pieces.append(CJK_LEFT_QUOTE[nesting % 2])
            nesting += 1
        else:
            # The closing quote uses the depth after leaving the level, so it
            # pairs with the opening quote of the same level.
            nesting -= 1
            pieces.append(CJK_RIGHT_QUOTE[nesting % 2])
    return "".join(pieces)
def process_m3t(m3t_path, type=ScriptType.GENERIC, llm_only=False, replace_quotes=True):
    """Read an M3T file and return its lines with marked lines normalized.

    Every line is stripped and cleared of zero-width spaces; a BOM is removed
    from the first line. A line whose prefix matches is split into its first
    character and the remaining text, the text is normalized according to
    ``type``, and each change is printed as ``path:line: old -> new``.

    Args:
        m3t_path: Path of the file to read (UTF-8).
        type: ScriptType selecting the normalizer; ``None`` is treated as
            ``ScriptType.GENERIC``.
        llm_only: When true, the first of the two prefix tests is skipped.
        replace_quotes: When true, replace_quote_str() is applied as well.

    Returns:
        The processed lines, without trailing newlines.
    """
    if type is None:
        # Callers that forward an unset command-line option pass None
        # explicitly, which would otherwise fail on type.is_scn().
        type = ScriptType.GENERIC
    lines = []
    # The context manager closes the file even if processing raises.
    with open(m3t_path, "r", encoding="utf-8") as f:
        for line_num, line in enumerate(f, start=1):
            line = line.strip()
            line = line.replace('\u200b', '')  # remove zero-width spaces
            if line_num == 1:
                line = line.lstrip("\ufeff")  # remove a possible BOM
            # NOTE(review): both prefixes are empty strings in this copy, so
            # every non-empty line matches and llm_only has no effect;
            # presumably two distinct marker characters were intended --
            # confirm. The leading `line and` keeps blank lines away from
            # line[0] below.
            if line and ((not llm_only and line.startswith("")) or line.startswith("")):
                data = line[1:].strip()
                if type.is_scn():
                    ndata = process_scn_line(data)
                elif type.is_xml():
                    ndata = process_xml_line(data)
                else:
                    ndata = process_line(data)
                if replace_quotes:
                    ndata = replace_quote_str(ndata)
                if data != ndata:
                    print(f"{m3t_path}:{line_num}: {data} -> {ndata}")
                    # Rebuild the line as first character, a space, new text.
                    line = line[0] + " " + ndata
            lines.append(line)
    return lines
def save_lines(m3t_path, lines):
    """Write ``lines`` to ``m3t_path`` as UTF-8, each followed by a newline.

    An existing file at ``m3t_path`` is overwritten.
    """
    with open(m3t_path, "w", encoding="utf-8") as out:
        out.writelines(entry + "\n" for entry in lines)
def process_m3t_file(m3t_path, target_path=None, dry_run=False, type=ScriptType.GENERIC, llm_only=False, replace_quotes=True):
    """Process one M3T file and, unless ``dry_run`` is set, save the result.

    Args:
        m3t_path: File to process.
        target_path: Where to write the result; ``None`` overwrites
            ``m3t_path``.
        dry_run: When true, the file is processed (changes are still
            printed by process_m3t) but nothing is written.
        type, llm_only, replace_quotes: Forwarded to process_m3t().
    """
    lines = process_m3t(m3t_path, type, llm_only, replace_quotes)
    if dry_run:
        return
    if target_path is None:
        save_lines(m3t_path, lines)
        return
    parent = os.path.dirname(target_path)
    # A bare file name has an empty dirname, and os.makedirs('') raises
    # FileNotFoundError; only create a directory when there is one to create.
    if parent:
        os.makedirs(parent, exist_ok=True)
    save_lines(target_path, lines)
def recursive_process_m3t(m3t_path, target_path=None, dry_run=False, type=ScriptType.GENERIC, llm_only=False, ext=".txt", replace_quotes=True):
    """Process every file under ``m3t_path`` whose name ends with ``ext``.

    Args:
        m3t_path: Directory to walk recursively.
        target_path: Output directory; ``None`` overwrites files in place.
        dry_run, type, llm_only, replace_quotes: Forwarded to
            process_m3t_file().
        ext: File-name suffix to select, compared case-insensitively.
    """
    # File names are lowercased before the comparison, so the suffix must be
    # lowercased too; otherwise a value such as ".TXT" never matches.
    wanted_ext = ext.lower()
    for root, _dirs, files in os.walk(m3t_path):
        for file in files:
            if not file.lower().endswith(wanted_ext):
                continue
            full_path = os.path.join(root, file)
            output_path = None
            if target_path is not None:
                # NOTE(review): output goes directly under target_path, so
                # same-named files from different subdirectories overwrite
                # each other -- confirm this flattening is intended.
                output_path = os.path.join(target_path, file)
            process_m3t_file(full_path, output_path, dry_run, type, llm_only, replace_quotes)
# Command-line entry point: process a single file, or every matching file
# under a directory.
if __name__ == "__main__":
    from argparse import ArgumentParser

    parser = ArgumentParser(description="Post-process M3T files.")
    parser.add_argument("m3t_path", help="Path to M3T file/directory.")
    parser.add_argument("target_path", nargs="?", default=None, help="Path to save processed M3T file/directory. If not specified, overwrite original files.")
    parser.add_argument("-d", "--dry-run", action="store_true", help="Only show changes without saving.")
    # Without an explicit default argparse stores None, which is then passed
    # on explicitly and overrides the GENERIC default of the functions below,
    # failing on type.is_scn().
    parser.add_argument("--type", type=ScriptType.from_string, choices=list(ScriptType), default=ScriptType.GENERIC, help="Process lines with special rules.")
    parser.add_argument("--llm-only", action="store_true", help="Only process LLM lines.")
    parser.add_argument("--ext", default=".txt", help="File extension to process in directory mode.")
    parser.add_argument("--no-replace-quotes", action="store_true", help="Do not replace quotes.")
    args = parser.parse_args()
    # The flag is negative ("--no-..."), the function parameter is positive.
    replace_quotes = not args.no_replace_quotes
    if os.path.isdir(args.m3t_path):
        recursive_process_m3t(args.m3t_path, args.target_path, args.dry_run, args.type, args.llm_only, args.ext, replace_quotes)
    else:
        process_m3t_file(args.m3t_path, args.target_path, args.dry_run, args.type, args.llm_only, replace_quotes)