"""Post-process the text lines of M3T files.

The module normalizes punctuation to Chinese full-width conventions,
collapses mixed or repeated ellipsis/dash/tilde runs, and removes Japanese
kana.  Only lines that start with ``●`` or ``△`` are rewritten; every other
line is passed through (stripped of surrounding whitespace).

This single module carries the contents of both ``m3tpostprocess.py`` (the
processing code and CLI) and ``m3tpostprocess_unittest.py`` (the checks at
the bottom, runnable with ``--self-test``).
"""
import math
import os
import re

# The test_* helpers below take arguments, so they are deliberately kept out
# of star-imports where a test collector could mistake them for test cases.
__all__ = [
    "HALF_FULL_MAP",
    "process_line",
    "process_scn_line",
    "process_m3t",
    "save_lines",
    "process_m3t_file",
    "recursive_process_m3t",
    "run_self_tests",
    "main",
]

# NOTE: full-width forms (U+FF01..U+FF5E) are written as \uXXXX escapes on
# purpose.  They are easy to confuse with -- or to get silently folded into --
# their ASCII look-alikes, which turns a mapping into a no-op.

# Three or more ASCII dots.
ELLIPSIS = re.compile(r'\.{3,}')
# One or more ellipsis characters.
ELLIPSIS_ZH = re.compile(r'…+')
# One or more em dashes.
HYPHEN_ZH = re.compile(r'—+')
# (source, replacement) pairs applied in order by process_line().
HALF_FULL_MAP = (
    (',', '\uff0c'),  # FULLWIDTH COMMA
    ('.', '。'),
    ('!', '\uff01'),  # FULLWIDTH EXCLAMATION MARK
    ('?', '\uff1f'),  # FULLWIDTH QUESTION MARK
    ('~', '\uff5e'),  # FULLWIDTH TILDE
    ('(', '\uff08'),  # FULLWIDTH LEFT PARENTHESIS
    (')', '\uff09'),  # FULLWIDTH RIGHT PARENTHESIS
    (':', '\uff1a'),  # FULLWIDTH COLON
    (';', '\uff1b'),  # FULLWIDTH SEMICOLON
    ('-', '—'),
    ('ー', '—'),
    ('&', '\uff06'),  # FULLWIDTH AMPERSAND
    ('%', '\uff05'),  # FULLWIDTH PERCENT SIGN
    ('・', '·'),
)
# Comma / full stop / enumeration comma directly before an ellipsis, dash or
# tilde run.  These run after HALF_FULL_MAP, so only full-width forms remain.
MIXED_RULE1 = re.compile(r'[\uff0c。、]+([…—\uff5e]+)')
# The same punctuation directly after such a run.
MIXED_RULE2 = re.compile(r'([…—\uff5e]+)[\uff0c。、]+')
# A run of two or more ellipsis/dash/tilde characters (possibly mixed).
MIXED_RULE3 = re.compile(r'([…—\uff5e])[…—\uff5e]+')
# Hiragana block.
HIRAGANA = re.compile(r'[\u3040-\u309F]+')
# Katakana block.
KATAKANA = re.compile(r'[\u30A0-\u30FF]+')
# Inline commands embedded in SCN text (attributed to wamsoft by the original
# author): "%...;" , "%r" and "#RRGGBB;" / "#RRGGBBAA;".  The capturing group
# makes re.split() keep the commands in its result.
SCN_SPECIAL = re.compile(r'(%(?:[^%;]*;|r)|#[0-9a-fA-F]{6,8};)')


def process_line(data: str) -> str:
    """Return *data* with punctuation normalized and kana removed.

    Steps, in order:

    1. Runs of three or more ASCII dots become one ``…`` per started group
       of three dots.
    2. Every pair in ``HALF_FULL_MAP`` is applied.
    3. Commas/full stops adjacent to an ellipsis/dash/tilde run are dropped,
       and a mixed run is rewritten using only its first character.
    4. Ellipsis runs become exactly ``……`` and dash runs exactly ``——``.
    5. Hiragana and katakana are deleted.
    6. Full-width ``!?`` is reordered to ``?!`` and a full stop directly
       before ``」`` is removed.
    """
    # Replace consecutive dots with ellipsis characters.
    data = ELLIPSIS.sub(lambda m: '…' * math.ceil(len(m.group(0)) / 3), data)
    for half, full in HALF_FULL_MAP:
        data = data.replace(half, full)
    # Comma/full stop mixed with ellipsis/dash/tilde: keep only the latter.
    data = MIXED_RULE1.sub(r'\1', data)
    data = MIXED_RULE2.sub(r'\1', data)
    # A mixed run keeps its length but uses only its first character.
    data = MIXED_RULE3.sub(lambda m: m.group(1) * len(m.group(0)), data)
    # Ellipses and dashes are always written as exactly one pair.
    data = ELLIPSIS_ZH.sub('……', data)
    data = HYPHEN_ZH.sub('——', data)
    # Delete all hiragana, then all katakana.
    data = HIRAGANA.sub('', data)
    data = KATAKANA.sub('', data)
    # The map above has already produced full-width marks, so the swap has to
    # look for the full-width pair.
    data = data.replace('\uff01\uff1f', '\uff1f\uff01')
    data = data.replace('。」', '」')
    return data


def process_scn_line(data: str) -> str:
    """Apply :func:`process_line` to the text between SCN inline commands.

    Commands matched by ``SCN_SPECIAL`` are copied through untouched, so
    their ``%`` / ``;`` / ``#`` characters are not converted.
    """
    parts = SCN_SPECIAL.split(data)
    # With one capturing group, split() alternates text (even indexes) and
    # command (odd indexes); only the text pieces are processed.
    for i in range(0, len(parts), 2):
        parts[i] = process_line(parts[i])
        # Special case "。%command;」": the full stop and the closing bracket
        # sit in different pieces, so process_line() cannot see them together.
        if i > 0 and parts[i].startswith('」'):
            parts[i - 2] = parts[i - 2].rstrip('。')
    return ''.join(parts)


def process_m3t(m3t_path, scn=False) -> list:
    """Read *m3t_path* and return its lines after post-processing.

    Lines starting with ``●`` or ``△`` have the text after the marker run
    through :func:`process_scn_line` (when *scn* is true) or
    :func:`process_line`.  Each change is printed as
    ``path:line: old -> new``.  Returned lines carry no trailing newline.
    """
    lines = []
    # "utf-8-sig" drops a leading BOM if there is one; the context manager
    # closes the file even when processing raises.
    with open(m3t_path, "r", encoding="utf-8-sig") as f:
        for line_num, raw in enumerate(f, start=1):
            # Remove zero-width spaces before stripping so that whitespace
            # hidden behind them is stripped as well.
            line = raw.replace('\u200b', '').strip()
            if line.startswith(("●", "△")):
                data = line[1:].strip()
                ndata = process_scn_line(data) if scn else process_line(data)
                if data != ndata:
                    print(f"{m3t_path}:{line_num}: {data} -> {ndata}")
                    line = line[0] + " " + ndata
            lines.append(line)
    return lines


def save_lines(m3t_path, lines):
    """Write *lines* to *m3t_path* as UTF-8, one per line."""
    with open(m3t_path, "w", encoding="utf-8") as f:
        f.writelines(line + "\n" for line in lines)


def process_m3t_file(m3t_path, target_path=None, dry_run=False, scn=False):
    """Post-process one file.

    The result is written to *target_path*, or back to *m3t_path* when
    *target_path* is ``None``.  With *dry_run* nothing is written; the
    changes are only printed by :func:`process_m3t`.
    """
    lines = process_m3t(m3t_path, scn)
    if dry_run:
        return
    if target_path is None:
        save_lines(m3t_path, lines)
        return
    out_dir = os.path.dirname(target_path)
    # A bare file name has an empty dirname, and os.makedirs("") raises.
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    save_lines(target_path, lines)


def recursive_process_m3t(m3t_path, target_path=None, dry_run=False, scn=False):
    """Post-process every file found under directory *m3t_path*.

    When *target_path* is given, each output file keeps its path relative to
    *m3t_path*, so equally named files in different sub-directories do not
    overwrite one another.  Every file is processed regardless of extension.
    """
    for root, _dirs, files in os.walk(m3t_path):
        for name in files:
            full_path = os.path.join(root, name)
            output_path = None
            if target_path is not None:
                relative = os.path.relpath(full_path, m3t_path)
                output_path = os.path.join(target_path, relative)
            process_m3t_file(full_path, output_path, dry_run, scn)


# ---------------------------------------------------------------------------
# Checks (from m3tpostprocess_unittest.py)
# ---------------------------------------------------------------------------

def test_process_line(src, expected):
    """Raise AssertionError unless ``process_line(src) == expected``."""
    processed = process_line(src)
    if processed != expected:
        raise AssertionError(f"Expected: {expected}, but got: {processed}")


def test_process_scn_line(src, expected):
    """Raise AssertionError unless ``process_scn_line(src) == expected``."""
    processed = process_scn_line(src)
    if processed != expected:
        raise AssertionError(f"Expected: {expected}, but got: {processed}")


def run_self_tests():
    """Run the built-in checks; raises AssertionError on the first failure."""
    line_cases = (
        ("Hello....", "Hello……"),
        ("……。", "……"),
        (",。……", "……"),
        ("…—~", "……"),
        ("----~...", "——"),
        ("~~--", "\uff5e\uff5e\uff5e\uff5e"),
        ("!?!?", "\uff1f\uff01\uff1f\uff01"),
        ("これはテストです测试。", "测试。"),
        ("「地形模型。」", "「地形模型」"),
    )
    for src, expected in line_cases:
        test_process_line(src, expected)
    scn_cases = (
        ("「地形模型。%command;」", "「地形模型%command;」"),
        # The command keeps its ASCII "%" and ";"; the bare "%" is converted.
        ("%cmd;25%的可能。", "%cmd;25\uff05的可能。"),
    )
    for src, expected in scn_cases:
        test_process_scn_line(src, expected)


def main(argv=None):
    """Command-line entry point; *argv* defaults to ``sys.argv[1:]``."""
    from argparse import ArgumentParser
    parser = ArgumentParser(description="Post-process M3T files.")
    # Optional only so that --self-test can be used on its own; it is still
    # required for normal processing (checked below).
    parser.add_argument("m3t_path", nargs="?", default=None, help="Path to M3T file/directory.")
    parser.add_argument("target_path", nargs="?", default=None, help="Path to save processed M3T file/directory. If not specified, overwrite original files.")
    parser.add_argument("-d", "--dry-run", action="store_true", help="Only show changes without saving.")
    parser.add_argument("--scn", action="store_true", help="Process SCN lines with special rules.")
    parser.add_argument("--self-test", action="store_true", help="Run the built-in checks and exit.")
    args = parser.parse_args(argv)
    if args.self_test:
        run_self_tests()
        return
    if args.m3t_path is None:
        parser.error("the following arguments are required: m3t_path")
    if os.path.isdir(args.m3t_path):
        recursive_process_m3t(args.m3t_path, args.target_path, args.dry_run, args.scn)
    else:
        process_m3t_file(args.m3t_path, args.target_path, args.dry_run, args.scn)


if __name__ == "__main__":
    main()