diff --git a/m3tpostprocess.py b/m3tpostprocess.py
index bcb1c58..9adf6e8 100644
--- a/m3tpostprocess.py
+++ b/m3tpostprocess.py
@@ -1,6 +1,36 @@
 import re
 import os
 import math
+from enum import Enum
+
+
+class ScriptType(str, Enum):
+    """Script flavour of an M3T file; process_m3t uses it to pick the per-line rules."""
+
+    GENERIC = "generic"
+    SCN = "scn"
+    XML = "xml"
+
+    def __str__(self):
+        # Bare value, so str(member) gives "scn" rather than "ScriptType.SCN".
+        return self.value
+
+    @staticmethod
+    def from_string(s: str):
+        """Parse a case-insensitive type name; raise ValueError if it is unknown."""
+        name = s.lower()
+        for stype in ScriptType:
+            if stype.value == name:
+                return stype
+        raise ValueError(f"Unknown ScriptType: {s}")
+
+    def is_scn(self):
+        """Return True when this is ScriptType.SCN."""
+        return self == ScriptType.SCN
+
+    def is_xml(self):
+        """Return True when this is ScriptType.XML."""
+        return self == ScriptType.XML
 
 
 ELLIPSIS = re.compile(r'\.{3,}')
@@ -24,13 +54,26 @@ HALF_FULL_MAP = (
 )
 MIXED_RULE1 = re.compile(r'[,。、]+([…—~]+)')
 MIXED_RULE2 = re.compile(r'([…—~]+)[,。、]+')
-MIXED_RULE3 = re.compile(f'([…—~])[…—~]+')
+MIXED_RULE3 = re.compile(r'([…—~])[…—~]+')
+# Punctuation run right before a closing parenthesis (ASCII or full-width) at end of line.
+MIXED_RULE4 = re.compile(r'[,。、]+([))])$')
 # 平假名
 HIRAGANA = re.compile(r'[\u3040-\u309F]+')
 # 片假名
 KATAKANA = re.compile(r'[\u30A0-\u30FF]+')
 # 狗屎wamsoft在scn里拉的指令
 SCN_SPECIAL = re.compile(r'(%(?:[^%;]*;|r)|#[0-9a-fA-F]{6,8};)')
+# Straight double-/single-quoted spans; greedy, so the outermost pair matches first.
+MUL_QUOTE_RULE = re.compile(r'"(.*)"')
+SINGLE_QUOTE_RULE = re.compile(r"'(.*)'")
+# Capturing split pattern: text lands at even indices, <tags> at odd indices.
+XML_TAG_SPECIAL = re.compile(r'(<[^>]+>)')
+# Corner brackets indexed by nesting depth % 2.
+CJK_LEFT_QUOTE = ["「", "『"]
+CJK_RIGHT_QUOTE = ["」", "』"]
+# Curly quotes that replace_quote_str rewrites to corner brackets.
+NEED_REPLACED_LEFT_QUOTE = ["“", "‘"]
+NEED_REPLACED_RIGHT_QUOTE = ["”", "’"]
 
 
 def process_line(data: str) -> str:
@@ -52,6 +95,14 @@ def process_line(data: str) -> str:
     data = KATAKANA.sub('', data)
     data = data.replace('!?', '?!')
     data = data.replace('。」', '」')
+    # Drop redundant punctuation before a closing parenthesis, keeping the parenthesis.
+    data = MIXED_RULE4.sub(r'\1', data)
+    # Straight quotes -> corner brackets. The greedy pattern converts only the
+    # outermost pair per pass, so repeat until no straight quotes remain.
+    while MUL_QUOTE_RULE.search(data):
+        data = MUL_QUOTE_RULE.sub(lambda m: '「' + m.group(1) + '」', data)
+    while SINGLE_QUOTE_RULE.search(data):
+        data = SINGLE_QUOTE_RULE.sub(lambda m: '『' + m.group(1) + '』', data)
     return data
 
 
@@ -67,7 +118,42 @@ def process_scn_line(data: str) -> str:
     return ''.join(parts)
 
 
-def process_m3t(m3t_path, scn=False):
+def process_xml_line(data: str) -> str:
+    """Apply process_line to the text between XML tags, leaving the tags untouched."""
+    # Capturing split: even indices are text, odd indices are the tags.
+    parts = XML_TAG_SPECIAL.split(data)
+    for i in range(0, len(parts), 2):
+        parts[i] = process_line(parts[i])
+        # When a tag separates '。' from '」', strip the full stops from the
+        # preceding text piece, mirroring process_line's '。」' -> '」' rule.
+        if i > 0 and parts[i].startswith('」'):
+            parts[i - 2] = parts[i - 2].rstrip('。')
+    return ''.join(parts)
+
+
+def replace_quote_str(s) -> str:
+    """Rewrite quotes as corner brackets, alternating 「」/『』 by nesting depth.
+
+    Curly quotes and existing corner brackets are both rewritten. An
+    unmatched closing quote is emitted as '」' and never drives the depth
+    below zero.
+    """
+    depth = 0
+    out = []  # not named `re`, which would shadow the regex module
+    for c in s:
+        if c in CJK_LEFT_QUOTE or c in NEED_REPLACED_LEFT_QUOTE:
+            out.append(CJK_LEFT_QUOTE[depth % 2])
+            depth += 1
+        elif c in CJK_RIGHT_QUOTE or c in NEED_REPLACED_RIGHT_QUOTE:
+            # Clamp so one stray closer cannot flip the bracket style of every later quote.
+            depth = max(depth - 1, 0)
+            out.append(CJK_RIGHT_QUOTE[depth % 2])
+        else:
+            out.append(c)
+    return ''.join(out)
+
+
+def process_m3t(m3t_path, type=ScriptType.GENERIC, llm_only=False, replace_quotes=True):
     lines = []
     line_num = 0
     for line in open(m3t_path, "r", encoding="utf-8"):
@@ -76,12 +162,16 @@ def process_m3t(m3t_path, scn=False):
         if line_num == 0:
             line = line.lstrip("\ufeff")  # 移除可能的BOM
         line_num += 1
-        if line.startswith("●") or line.startswith("△"):
+        if (not llm_only and line.startswith("●")) or line.startswith("△"):
             data = line[1:].strip()
-            if scn:
+            if type.is_scn():
                 ndata = process_scn_line(data)
+            elif type.is_xml():
+                ndata = process_xml_line(data)
             else:
                 ndata = process_line(data)
+            if replace_quotes:
+                ndata = replace_quote_str(ndata)
             if data != ndata:
                 print(f"{m3t_path}:{line_num}: {data} -> {ndata}")
                 line = line[0] + " " + ndata
@@ -95,8 +185,8 @@ def save_lines(m3t_path, lines):
             f.write(line + "\n")
 
 
-def process_m3t_file(m3t_path, target_path=None, dry_run=False, scn=False):
-    lines = process_m3t(m3t_path, scn)
+def process_m3t_file(m3t_path, target_path=None, dry_run=False, type=ScriptType.GENERIC, llm_only=False, replace_quotes=True):
+    lines = process_m3t(m3t_path, type, llm_only, replace_quotes)
     if not dry_run:
         if target_path is None:
             save_lines(m3t_path, lines)
@@ -105,14 +195,17 @@ def process_m3t_file(m3t_path, target_path=None, dry_run=False, scn=False):
             save_lines(target_path, lines)
 
 
-def recursive_process_m3t(m3t_path, target_path=None, dry_run=False, scn=False):
+def recursive_process_m3t(m3t_path, target_path=None, dry_run=False, type=ScriptType.GENERIC, llm_only=False, ext=".txt", replace_quotes=True):
     for root, dirs, files in os.walk(m3t_path):
         for file in files:
+            # Compare case-insensitively on both sides so ext=".TXT" still matches.
+            if not file.lower().endswith(ext.lower()):
+                continue
             full_path = os.path.join(root, file)
             output_path = None
             if target_path is not None:
                 output_path = os.path.join(target_path, file)
-            process_m3t_file(full_path, output_path, dry_run, scn)
+            process_m3t_file(full_path, output_path, dry_run, type, llm_only, replace_quotes)
 
 
 if __name__ == "__main__":
@@ -121,9 +214,12 @@ if __name__ == "__main__":
     parser.add_argument("m3t_path", help="Path to M3T file/directory.")
     parser.add_argument("target_path", nargs="?", default=None, help="Path to save processed M3T file/directory. If not specified, overwrite original files.")
     parser.add_argument("-d", "--dry-run", action="store_true", help="Only show changes without saving.")
-    parser.add_argument("--scn", action="store_true", help="Process SCN lines with special rules.")
+    parser.add_argument("--type", type=ScriptType.from_string, choices=list(ScriptType), default=ScriptType.GENERIC, help="Process lines with special rules.")
+    parser.add_argument("--llm-only", action="store_true", help="Only process LLM lines.")
+    parser.add_argument("--ext", default=".txt", help="File extension to process in directory mode.")
+    parser.add_argument("--no-replace-quotes", action="store_true", help="Do not replace quotes.")
     args = parser.parse_args()
     if os.path.isdir(args.m3t_path):
-        recursive_process_m3t(args.m3t_path, args.target_path, args.dry_run, args.scn)
+        recursive_process_m3t(args.m3t_path, args.target_path, args.dry_run, args.type, args.llm_only, args.ext, not args.no_replace_quotes)
     else:
-        process_m3t_file(args.m3t_path, args.target_path, args.dry_run, args.scn)
+        process_m3t_file(args.m3t_path, args.target_path, args.dry_run, args.type, args.llm_only, not args.no_replace_quotes)
diff --git a/m3tpostprocess_unittest.py b/m3tpostprocess_unittest.py
index f0a2413..9389365 100644
--- a/m3tpostprocess_unittest.py
+++ b/m3tpostprocess_unittest.py
@@ -1,4 +1,4 @@
-from m3tpostprocess import process_line, process_scn_line
+from m3tpostprocess import process_line, process_scn_line, process_xml_line, replace_quote_str
 
 
 def test_process_line(src, expected):
@@ -12,6 +12,19 @@ def test_process_scn_line(src, expected):
     if processed != expected:
         raise AssertionError(f"Expected: {expected}, but got: {processed}")
 
+def test_process_xml_line(src, expected):
+    """Raise AssertionError unless process_xml_line(src) equals expected."""
+    processed = process_xml_line(src)
+    if processed != expected:
+        raise AssertionError(f"Expected: {expected}, but got: {processed}")
+
+
+def test_replace_quote_str(src, expected):
+    """Raise AssertionError unless replace_quote_str(src) equals expected."""
+    processed = replace_quote_str(src)
+    if processed != expected:
+        raise AssertionError(f"Expected: {expected}, but got: {processed}")
+
 
 if __name__ == "__main__":
     test_process_line("Hello....", "Hello……")
@@ -23,5 +36,13 @@ if __name__ == "__main__":
     test_process_line("!?!?", "?!?!")
     test_process_line("これはテストです测试。", "测试。")
     test_process_line("「地形模型。」", "「地形模型」")
+    test_process_line("(测试。。、)", "(测试)")
+    test_process_line('"狗屎\'引号\'就是"一坨""', "「狗屎『引号』就是「一坨」」")
     test_process_scn_line("「地形模型。%command;」", "「地形模型%command;」")
     test_process_scn_line("%cmd;25%的可能。", "%cmd;25%的可能。")
+    test_process_xml_line("(测试。。、)", "(测试)")
+    test_process_xml_line('"狗屎\'引号\'就是"一坨""', '「狗屎『引号』就是「一坨」」')
+    test_process_xml_line("「地形模型。」", "「地形模型」")
+    test_process_xml_line("「地形模型。<br>」", "「地形模型<br>」")
+    test_replace_quote_str("「狗屎『引号』就是「一坨」」", "「狗屎『引号』就是『一坨』」")
+    test_replace_quote_str("“外‘内’”", "「外『内』」")