Add postprocess script
This commit is contained in:
129
m3tpostprocess.py
Normal file
129
m3tpostprocess.py
Normal file
@@ -0,0 +1,129 @@
|
||||
import re
|
||||
import os
|
||||
import math
|
||||
|
||||
|
||||
# A run of three or more ASCII dots (a typed-out ellipsis).
ELLIPSIS = re.compile(r'\.{3,}')
# A run of one or more ellipsis characters.
ELLIPSIS_ZH = re.compile(r'…+')
# A run of one or more em dashes.
HYPHEN_ZH = re.compile(r'—+')
# (source, replacement) pairs; process_line applies them in this order with
# str.replace.
# NOTE(review): several pairs below map a character to itself as written
# here; presumably the right-hand sides were meant to be full-width forms --
# confirm against the original file's encoding.
HALF_FULL_MAP = (
    (',', ','),
    ('.', '。'),
    ('!', '!'),
    ('?', '?'),
    ('~', '~'),
    ('(', '('),
    (')', ')'),
    (':', ':'),
    (';', ';'),
    ('-', '—'),
    ('ー', '—'),
    ('&', '&'),
    ('%', '%'),
    ('・', '·'),
)
# Pause punctuation (,。、) directly before a run of …—~ ; group 1 is the run.
MIXED_RULE1 = re.compile(r'[,。、]+([…—~]+)')
# A run of …—~ directly followed by pause punctuation; group 1 is the run.
MIXED_RULE2 = re.compile(r'([…—~]+)[,。、]+')
# Two or more consecutive characters out of …—~ ; group 1 is the first one.
# (Raw string rather than an f-string: the pattern has no placeholders.)
MIXED_RULE3 = re.compile(r'([…—~])[…—~]+')
# Hiragana
HIRAGANA = re.compile(r'[\u3040-\u309F]+')
# Katakana
KATAKANA = re.compile(r'[\u30A0-\u30FF]+')
# Inline directives that wamsoft embeds in scn text: '%' up to the next ';',
# the two-character '%r', or '#' followed by 6-8 hex digits and ';'.
# The single capturing group makes re.split keep the directives in its result.
SCN_SPECIAL = re.compile(r'(%(?:[^%;]*;|r)|#[0-9a-fA-F]{6,8};)')
|
||||
|
||||
|
||||
def process_line(data: str) -> str:
    """Normalize the punctuation of one line of text and strip kana.

    Steps, in order:
      1. every run of three or more ASCII dots becomes one ellipsis
         character per started group of three dots;
      2. each (source, replacement) pair of HALF_FULL_MAP is applied;
      3. ,。、 touching a run of …—~ is dropped, and a run mixing …—~ becomes
         its first character repeated to the same length;
      4. any run of … collapses to '……' and any run of — to '——';
      5. all hiragana and katakana are removed;
      6. '!?' becomes '?!' and '。」' becomes '」'.

    Args:
        data: the text to normalize.

    Returns:
        The normalized text.
    """
    def _dots_to_ellipsis(match):
        # Ceiling division: 3 dots -> 1 ellipsis, 4-6 dots -> 2, and so on.
        return '…' * ((len(match[0]) + 2) // 3)

    def _repeat_first(match):
        # Keep the run's length but use only its first character.
        return match[1] * len(match[0])

    text = ELLIPSIS.sub(_dots_to_ellipsis, data)
    for source, replacement in HALF_FULL_MAP:
        text = text.replace(source, replacement)

    # ,。、 mixed with …—~ : keep only the …—~ run.
    text = MIXED_RULE1.sub(r'\1', text)
    text = MIXED_RULE2.sub(r'\1', text)
    # A run mixing …—~ is unified to a single kind of character.
    text = MIXED_RULE3.sub(_repeat_first, text)

    # Ellipses and dashes are always written as exactly one pair.
    text = HYPHEN_ZH.sub('——', ELLIPSIS_ZH.sub('……', text))

    # Drop all hiragana, then all katakana.
    text = KATAKANA.sub('', HIRAGANA.sub('', text))

    return text.replace('!?', '?!').replace('。」', '」')
|
||||
|
||||
|
||||
def process_scn_line(data: str) -> str:
    """Apply process_line to the text of an scn line, leaving directives alone.

    The line is split on SCN_SPECIAL. Because that pattern has one capturing
    group, re.split returns text segments at even indexes and the matched
    directives at odd indexes. Only the text segments are passed through
    process_line; directives are emitted unchanged.

    process_line removes a '。' that directly precedes '」', but it cannot see
    that pair when directives sit between the two characters. That case is
    handled here: when a text segment starts with '」', trailing '。' are
    removed from the closest preceding non-empty text segment (segments
    between back-to-back directives are empty and are skipped).

    Args:
        data: one scn text line, possibly containing inline directives.

    Returns:
        The processed line with all directives preserved in place.
    """
    parts = SCN_SPECIAL.split(data)
    for i in range(0, len(parts), 2):
        parts[i] = process_line(parts[i])
        if i == 0 or not parts[i].startswith('」'):
            continue
        # Special case: 。%command;」 (possibly with several commands in a row).
        j = i - 2
        while j > 0 and not parts[j]:
            j -= 2
        parts[j] = parts[j].rstrip('。')
    return ''.join(parts)
|
||||
|
||||
|
||||
def process_m3t(m3t_path, scn=False):
    """Read an M3T file and return its lines with marked lines post-processed.

    Every line has zero-width spaces removed and surrounding whitespace
    stripped; a BOM at the start of the first line is removed as well. For a
    line whose first character is ● or △, the rest of the line is run through
    process_scn_line (when *scn* is true) or process_line, and the line is
    rebuilt as marker + single space + processed text. Each change is printed
    as ``path:line_number: before -> after``.

    Args:
        m3t_path: path of the UTF-8 encoded file to read.
        scn: use process_scn_line instead of process_line.

    Returns:
        A list with one cleaned string per input line (no line terminators).
    """
    lines = []
    # The context manager closes the file even if processing raises.
    with open(m3t_path, "r", encoding="utf-8") as f:
        for line_num, raw_line in enumerate(f, start=1):
            # Remove zero-width spaces first: str.strip() does not treat them
            # as whitespace, so they would otherwise shield real whitespace.
            line = raw_line.replace('\u200b', '')
            if line_num == 1:
                # Remove a possible BOM, which strip() also leaves in place.
                line = line.strip().lstrip("\ufeff")
            line = line.strip()
            if line.startswith(("●", "△")):
                data = line[1:].strip()
                if scn:
                    ndata = process_scn_line(data)
                else:
                    ndata = process_line(data)
                if data != ndata:
                    print(f"{m3t_path}:{line_num}: {data} -> {ndata}")
                line = line[0] + " " + ndata
            lines.append(line)
    return lines
|
||||
|
||||
|
||||
def save_lines(m3t_path, lines):
    """Write *lines* to *m3t_path* as UTF-8, terminating each with a newline.

    An existing file at *m3t_path* is overwritten.

    Args:
        m3t_path: path of the file to write.
        lines: iterable of strings without line terminators.
    """
    with open(m3t_path, "w", encoding="utf-8") as out:
        out.writelines(entry + "\n" for entry in lines)
|
||||
|
||||
|
||||
def process_m3t_file(m3t_path, target_path=None, dry_run=False, scn=False):
    """Post-process one M3T file and, unless *dry_run*, write the result.

    Args:
        m3t_path: path of the file to read.
        target_path: where to write the result; when None the input file is
            overwritten. Missing parent directories are created.
        dry_run: when true, the file is processed (so changes are still
            reported by process_m3t) but nothing is written.
        scn: forwarded to process_m3t.
    """
    lines = process_m3t(m3t_path, scn)
    if dry_run:
        return
    if target_path is None:
        save_lines(m3t_path, lines)
        return
    target_dir = os.path.dirname(target_path)
    # A bare file name has an empty dirname, and os.makedirs('') raises
    # FileNotFoundError, so only create a directory when one is named.
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)
    save_lines(target_path, lines)
|
||||
|
||||
|
||||
def recursive_process_m3t(m3t_path, target_path=None, dry_run=False, scn=False):
    """Post-process every file found under the directory *m3t_path*.

    Args:
        m3t_path: root directory; it is walked recursively with os.walk.
        target_path: output root. When None each file is handled in place
            (see process_m3t_file). Otherwise each file is written to the
            same relative location under *target_path*, so files that share
            a name in different subdirectories do not overwrite each other.
        dry_run: forwarded to process_m3t_file.
        scn: forwarded to process_m3t_file.
    """
    for root, _dirs, files in os.walk(m3t_path):
        for file in files:
            full_path = os.path.join(root, file)
            output_path = None
            if target_path is not None:
                # Mirror the source layout instead of flattening it.
                relative = os.path.relpath(full_path, m3t_path)
                output_path = os.path.join(target_path, relative)
            process_m3t_file(full_path, output_path, dry_run, scn)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Command-line entry point: post-process a single file or a whole directory.
    import argparse

    cli = argparse.ArgumentParser(description="Post-process M3T files.")
    cli.add_argument("m3t_path", help="Path to M3T file/directory.")
    cli.add_argument(
        "target_path",
        nargs="?",
        default=None,
        help="Path to save processed M3T file/directory. If not specified, overwrite original files.",
    )
    cli.add_argument(
        "-d",
        "--dry-run",
        action="store_true",
        help="Only show changes without saving.",
    )
    cli.add_argument(
        "--scn",
        action="store_true",
        help="Process SCN lines with special rules.",
    )
    opts = cli.parse_args()

    # A directory is walked recursively; anything else is handled as one file.
    if os.path.isdir(opts.m3t_path):
        handler = recursive_process_m3t
    else:
        handler = process_m3t_file
    handler(opts.m3t_path, opts.target_path, opts.dry_run, opts.scn)
|
||||
27
m3tpostprocess_unittest.py
Normal file
27
m3tpostprocess_unittest.py
Normal file
@@ -0,0 +1,27 @@
|
||||
from m3tpostprocess import process_line, process_scn_line
|
||||
|
||||
|
||||
def test_process_line(src, expected):
    """Check that process_line(src) equals *expected*.

    Raises:
        AssertionError: when the result differs; the message shows both the
            expected and the actual value.
    """
    actual = process_line(src)
    if actual == expected:
        return
    raise AssertionError(f"Expected: {expected}, but got: {actual}")
|
||||
|
||||
|
||||
def test_process_scn_line(src, expected):
    """Check that process_scn_line(src) equals *expected*.

    Raises:
        AssertionError: when the result differs; the message shows both the
            expected and the actual value.
    """
    actual = process_scn_line(src)
    if actual == expected:
        return
    raise AssertionError(f"Expected: {expected}, but got: {actual}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # (input, expected output) pairs for process_line, checked in order.
    line_cases = (
        ("Hello....", "Hello……"),
        ("……。", "……"),
        (",。……", "……"),
        ("…—~", "……"),
        ("----~...", "——"),
        ("~~--", "~~~~"),
        ("!?!?", "?!?!"),
        ("これはテストです测试。", "测试。"),
        ("「地形模型。」", "「地形模型」"),
    )
    # (input, expected output) pairs for process_scn_line, checked in order.
    scn_cases = (
        ("「地形模型。%command;」", "「地形模型%command;」"),
        ("%cmd;25%的可能。", "%cmd;25%的可能。"),
    )

    # The first failing case raises AssertionError and stops the run.
    for source, wanted in line_cases:
        test_process_line(source, wanted)
    for source, wanted in scn_cases:
        test_process_scn_line(source, wanted)
|
||||
Reference in New Issue
Block a user