"""Post-process translated M3T script files.

Normalizes punctuation in translation lines (half-width to full-width CJK
punctuation, ellipsis/dash runs, quote styles), strips kana, and makes
sure a translated line keeps the dialogue brackets of its source line.

NOTE(review): the full-width characters (U+FF01..U+FF5E) in this module are
written as ``\\uXXXX`` escapes on purpose, so that a width-folding step in
an editor or pipeline cannot silently turn them into their ASCII twins.
"""

import re
import os
import math
from enum import Enum


class ScriptType(str, Enum):
    """Kind of script markup embedded in the text of an M3T file."""

    GENERIC = "generic"
    SCN = "scn"
    XML = "xml"
    CIRCUS = "circus"

    def __str__(self):
        return self.value

    @staticmethod
    def from_string(s: str):
        """Return the member whose value equals *s*, ignoring case.

        Raises:
            ValueError: if no member matches.
        """
        wanted = s.lower()
        for stype in ScriptType:
            if stype.value == wanted:
                return stype
        raise ValueError(f"Unknown ScriptType: {s}")

    def is_scn(self):
        return self == ScriptType.SCN

    def is_xml(self):
        return self == ScriptType.XML

    def is_circus(self):
        return self == ScriptType.CIRCUS


# Three or more ASCII dots.
ELLIPSIS = re.compile(r'\.{3,}')
# Runs of the ellipsis character / the em dash.
ELLIPSIS_ZH = re.compile(r'…+')
HYPHEN_ZH = re.compile(r'—+')

# (half-width or foreign form, full-width CJK form) pairs applied to every
# text segment. No replacement character is itself a key, so the pairs are
# order-independent.
HALF_FULL_MAP = (
    (',', '\uff0c'),  # FULLWIDTH COMMA
    ('.', '。'),
    ('!', '\uff01'),  # FULLWIDTH EXCLAMATION MARK
    ('?', '\uff1f'),  # FULLWIDTH QUESTION MARK
    ('~', '\uff5e'),  # FULLWIDTH TILDE
    ('(', '\uff08'),  # FULLWIDTH LEFT PARENTHESIS
    (')', '\uff09'),  # FULLWIDTH RIGHT PARENTHESIS
    (':', '\uff1a'),  # FULLWIDTH COLON
    (';', '\uff1b'),  # FULLWIDTH SEMICOLON
    ('-', '—'),
    ('ー', '—'),
    ('&', '\uff06'),  # FULLWIDTH AMPERSAND
    ('%', '\uff05'),  # FULLWIDTH PERCENT SIGN
    ('・', '·'),
    ('“', '「'),
    ('”', '」'),
    ("‘", '『'),
    ("’", '』'),
)
# Single-pass translation table equivalent to applying HALF_FULL_MAP in order.
_HALF_FULL_TABLE = str.maketrans(dict(HALF_FULL_MAP))

# Pause punctuation (full-width comma, 。, 、) directly before / after a run
# of ellipsis, dash or full-width tilde.
MIXED_RULE1 = re.compile(r'[\uff0c。、]+([…—\uff5e]+)')
MIXED_RULE2 = re.compile(r'([…—\uff5e]+)[\uff0c。、]+')
# A run of two or more of ellipsis / dash / full-width tilde.
MIXED_RULE3 = re.compile(r'([…—\uff5e])[…—\uff5e]+')
# Pause punctuation directly before a line-final full-width right parenthesis.
MIXED_RULE4 = re.compile(r'([\uff0c。、]+)\uff09$')
# Hiragana
HIRAGANA = re.compile(r'[\u3040-\u309F]+')
# Katakana
KATAKANA = re.compile(r'[\u30A0-\u30FF]+')
# Inline control commands that SCN scripts embed in the text
# (%...; / %r / #RRGGBB; / #RRGGBBAA;).
SCN_SPECIAL = re.compile(r'(%(?:[^%;]*;|r)|#[0-9a-fA-F]{6,8};)')
MUL_QUOTE_RULE = re.compile(r'"(.*)"')
SINGLE_QUOTE_RULE = re.compile(r"'(.*)'")
XML_TAG_SPECIAL = re.compile(r'(<[^>]+>)')
CJK_LEFT_QUOTE = ["「", "『"]
CJK_RIGHT_QUOTE = ["」", "』"]
NEED_REPLACED_LEFT_QUOTE = ["“", "‘"]
NEED_REPLACED_RIGHT_QUOTE = ["”", "’"]
# Bracket pairs that mark a whole line as dialogue / inner monologue.
DIALOGUES = (
    ("「", "」"),
    ("『", "』"),
    ("\uff08", "\uff09"),  # full-width parentheses
)
CIRCUS_SPECIAL = re.compile(r'(@[a-zA-Z0-9]+)')


def process_line(data: str) -> str:
    """Normalize the punctuation of one plain-text segment.

    Args:
        data: Text without any engine-specific markup.

    Returns:
        The text with full-width punctuation, canonical ellipsis/dash pairs,
        kana removed and ASCII quotes turned into CJK brackets.
    """
    # Replace runs of dots with ellipsis characters (one per three dots,
    # rounded up). Must run before '.' is mapped to the ideographic full stop.
    data = ELLIPSIS.sub(lambda m: '…' * math.ceil(len(m.group(0)) / 3), data)
    data = data.translate(_HALF_FULL_TABLE)
    # Pause punctuation mixed with ellipsis/dash/tilde: drop the pause marks.
    data = MIXED_RULE1.sub(lambda m: m.group(1), data)
    data = MIXED_RULE2.sub(lambda m: m.group(1), data)
    # A mixed run of ellipsis/dash/tilde is rewritten as a run of equal
    # length that repeats only its first character.
    data = MIXED_RULE3.sub(lambda m: m.group(1) * len(m.group(0)), data)
    # Ellipsis and dash runs always become exactly one pair.
    data = ELLIPSIS_ZH.sub('……', data)
    data = HYPHEN_ZH.sub('——', data)
    # Remove all hiragana.
    data = HIRAGANA.sub('', data)
    # Remove all katakana.
    data = KATAKANA.sub('', data)
    # Full-width "!?" becomes "?!".
    data = data.replace('\uff01\uff1f', '\uff1f\uff01')
    data = data.replace('。」', '」')
    # Remove redundant punctuation before a closing full-width parenthesis.
    data = MIXED_RULE4.sub('\uff09', data)
    # Each pass converts the outermost pair of ASCII quotes; repeat until no
    # pair is left.
    while MUL_QUOTE_RULE.search(data):
        data = MUL_QUOTE_RULE.sub(lambda m: '「' + m.group(1) + '」', data)
    while SINGLE_QUOTE_RULE.search(data):
        data = SINGLE_QUOTE_RULE.sub(lambda m: '『' + m.group(1) + '』', data)
    return data


def _process_split_line(pattern, data: str, prepare=None) -> str:
    """Run process_line on the text between matches of *pattern*.

    *pattern* must contain exactly one capturing group so that ``split``
    alternates text (even indexes) and markup (odd indexes). Markup is kept
    verbatim. *prepare*, if given, is applied to each text segment first.
    """
    parts = pattern.split(data)
    for i in range(0, len(parts), 2):
        text = parts[i] if prepare is None else prepare(parts[i])
        parts[i] = process_line(text)
        # Special case "。<markup>」": process_line cannot see across the
        # markup, so drop the full stops that end the previous text segment.
        if i > 0 and parts[i].startswith('」'):
            parts[i - 2] = parts[i - 2].rstrip('。')
    return ''.join(parts)


def process_scn_line(data: str) -> str:
    """Normalize a line that may contain SCN inline commands."""
    return _process_split_line(SCN_SPECIAL, data)


def process_xml_line(data: str) -> str:
    """Normalize a line that may contain XML tags."""
    return _process_split_line(XML_TAG_SPECIAL, data)


def process_circus_line(data: str) -> str:
    """Normalize a line that may contain ``@command`` markers."""
    # NOTE(review): assumed direction is FULLWIDTH COMMERCIAL AT -> '@';
    # confirm against the intended behavior.
    return _process_split_line(
        CIRCUS_SPECIAL, data, lambda text: text.replace('\uff20', '@'))


def replace_quote_str(s) -> str:
    """Rewrite quotes so nesting alternates between 「」 and 『』.

    Western curly quotes are treated like CJK brackets. A closing quote with
    no matching opener is emitted as an outer-level 」 and does not shift the
    nesting level of the quotes that follow it.
    """
    depth = 0
    out = []
    for c in s:
        if c in CJK_LEFT_QUOTE or c in NEED_REPLACED_LEFT_QUOTE:
            out.append(CJK_LEFT_QUOTE[depth % 2])
            depth += 1
        elif c in CJK_RIGHT_QUOTE or c in NEED_REPLACED_RIGHT_QUOTE:
            # Clamp at 0 so a stray closer cannot make the depth negative.
            depth = max(depth - 1, 0)
            out.append(CJK_RIGHT_QUOTE[depth % 2])
        else:
            out.append(c)
    return ''.join(out)


def check_dialogue(source: str, target: str) -> str:
    """Give *target* the dialogue brackets that enclose *source*, if any."""
    for left, right in DIALOGUES:
        if source.startswith(left) and source.endswith(right):
            if not target.startswith(left):
                target = left + target
            if not target.endswith(right):
                target = target + right
    return target


# Line normalizer per script type; anything else uses process_line.
_LINE_PROCESSORS = {
    ScriptType.SCN: process_scn_line,
    ScriptType.XML: process_xml_line,
    ScriptType.CIRCUS: process_circus_line,
}


def process_m3t(m3t_path, type=ScriptType.GENERIC, llm_only=False,
                replace_quotes=True):
    """Read an M3T file and return its lines with translations normalized.

    Lines starting with "○" (except "○ NAME:") are remembered as the current
    source text. Lines starting with "△", and lines starting with "●" unless
    *llm_only* is set, are normalized. An empty line forgets the current
    source text. Every changed line is reported on stdout.

    Args:
        m3t_path: Path of the UTF-8 encoded M3T file.
        type: ScriptType (or its string value) selecting the markup rules.
        llm_only: Leave "●" lines untouched.
        replace_quotes: Also apply replace_quote_str.

    Returns:
        The processed lines, without line terminators.
    """
    if not isinstance(type, ScriptType):
        type = ScriptType.from_string(str(type))
    normalize = _LINE_PROCESSORS.get(type, process_line)
    lines = []
    src = None
    with open(m3t_path, "r", encoding="utf-8") as f:
        for line_num, line in enumerate(f, start=1):
            line = line.strip()
            line = line.replace('\u200b', '')  # remove zero-width spaces
            if line_num == 1:
                line = line.lstrip("\ufeff")  # remove a possible BOM
            if line.startswith("○"):
                if not line.startswith("○ NAME:"):
                    src = line[1:].strip()
            if (not llm_only and line.startswith("●")) or line.startswith("△"):
                data = line[1:].strip()
                ndata = data
                if src is not None:
                    ndata = check_dialogue(src, ndata)
                ndata = normalize(ndata)
                if replace_quotes:
                    ndata = replace_quote_str(ndata)
                if data != ndata:
                    print(f"{m3t_path}:{line_num}: {data} -> {ndata}")
                line = line[0] + " " + ndata
            elif not line and src is not None:
                src = None
            lines.append(line)
    return lines


def save_lines(m3t_path, lines):
    """Write *lines* to *m3t_path* as UTF-8, one per line."""
    with open(m3t_path, "w", encoding="utf-8") as f:
        f.writelines(line + "\n" for line in lines)


def process_m3t_file(m3t_path, target_path=None, dry_run=False,
                     type=ScriptType.GENERIC, llm_only=False,
                     replace_quotes=True):
    """Process one M3T file and save the result.

    The result overwrites *m3t_path* unless *target_path* is given. With
    *dry_run* nothing is written; changes are only printed.
    """
    lines = process_m3t(m3t_path, type, llm_only, replace_quotes)
    if dry_run:
        return
    if target_path is None:
        save_lines(m3t_path, lines)
        return
    target_dir = os.path.dirname(target_path)
    # dirname is '' for a bare file name, and os.makedirs('') raises.
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)
    save_lines(target_path, lines)


def recursive_process_m3t(m3t_path, target_path=None, dry_run=False,
                          type=ScriptType.GENERIC, llm_only=False,
                          ext=".txt", replace_quotes=True):
    """Process every file under *m3t_path* whose name ends with *ext*.

    The extension is compared case-insensitively.
    """
    ext = ext.lower()
    for root, _dirs, files in os.walk(m3t_path):
        for file in files:
            if not file.lower().endswith(ext):
                continue
            full_path = os.path.join(root, file)
            output_path = None
            if target_path is not None:
                # NOTE(review): output is flattened into target_path, so
                # same-named files from different subdirectories overwrite
                # each other. Confirm whether the tree should be mirrored.
                output_path = os.path.join(target_path, file)
            process_m3t_file(full_path, output_path, dry_run, type, llm_only,
                             replace_quotes)


def main():
    """Command-line entry point."""
    from argparse import ArgumentParser
    parser = ArgumentParser(description="Post-process M3T files.")
    parser.add_argument("m3t_path", help="Path to M3T file/directory.")
    parser.add_argument("target_path", nargs="?", default=None, help="Path to save processed M3T file/directory. If not specified, overwrite original files.")
    parser.add_argument("-d", "--dry-run", action="store_true", help="Only show changes without saving.")
    parser.add_argument("--type", default=ScriptType.XML, type=ScriptType.from_string, choices=list(ScriptType), help="Process lines with special rules.")
    parser.add_argument("--llm-only", action="store_true", help="Only process LLM lines.")
    parser.add_argument("--ext", default=".txt", help="File extension to process in directory mode.")
    parser.add_argument("--no-replace-quotes", action="store_true", help="Do not replace quotes.")
    args = parser.parse_args()
    if os.path.isdir(args.m3t_path):
        recursive_process_m3t(args.m3t_path, args.target_path, args.dry_run, args.type, args.llm_only, args.ext, not args.no_replace_quotes)
    else:
        process_m3t_file(args.m3t_path, args.target_path, args.dry_run, args.type, args.llm_only, not args.no_replace_quotes)


if __name__ == "__main__":
    main()