更新后处理脚本

2026-01-05 10:23:54 +08:00
parent f34f1e8380
commit 209da70424
2 changed files with 107 additions and 12 deletions
--- a/m3tpostprocess.py
+++ b/m3tpostprocess.py
@@ -1,6 +1,29 @@
 import re
 import os
 import math
+from enum import Enum
+
+
+class ScriptType(str, Enum):
+    """Kind of script being processed; selects which line rules are applied.
+
+    Members are also ``str`` instances whose value is the lowercase name
+    accepted by ``from_string``.
+    """
+
+    GENERIC = "generic"
+    SCN = "scn"
+    XML = "xml"
+
+    def __str__(self):
+        """Return the bare value (e.g. ``"scn"``) instead of ``ScriptType.SCN``."""
+        return self.value
+
+    @staticmethod
+    def from_string(s: str):
+        """Return the member whose value equals ``s``, ignoring case.
+
+        Raises:
+            ValueError: if no member has that value.
+        """
+        wanted = s.lower()
+        by_value = {member.value: member for member in ScriptType}
+        if wanted not in by_value:
+            raise ValueError(f"Unknown ScriptType: {s}")
+        return by_value[wanted]
+
+    def is_scn(self):
+        """Return True when this member is ``ScriptType.SCN``."""
+        return ScriptType.SCN == self
+
+    def is_xml(self):
+        """Return True when this member is ``ScriptType.XML``."""
+        return ScriptType.XML == self


 ELLIPSIS = re.compile(r'\.{3,}')
@@ -24,13 +47,21 @@ HALF_FULL_MAP = (
 )
 MIXED_RULE1 = re.compile(r'[，。、]+([…—～]+)')
 MIXED_RULE2 = re.compile(r'([…—～]+)[，。、]+')
-MIXED_RULE3 = re.compile(f'([…—～])[…—～]+')
+MIXED_RULE3 = re.compile(r'([…—～])[…—～]+')
+MIXED_RULE4 = re.compile(r'([，。、]+)）$')
 # 平假名
 HIRAGANA = re.compile(r'[\u3040-\u309F]+')
 # 片假名
 KATAKANA = re.compile(r'[\u30A0-\u30FF]+')
 # 狗屎wamsoft在scn里拉的指令
 SCN_SPECIAL = re.compile(r'(%(?:[^%;]*;|r)|#[0-9a-fA-F]{6,8};)')
+# Greedy match of everything between the first and the last ASCII double quote
+# on a line; process_line rewrites each match as 「…」 until none is left.
+MUL_QUOTE_RULE = re.compile(r'"(.*)"')
+# Same for ASCII single quotes, which process_line rewrites as 『…』.
+SINGLE_QUOTE_RULE = re.compile(r"'(.*)'")
+# A single <...> tag. The capturing group makes split() keep the tags, so
+# process_xml_line finds text at even indices and tags at odd indices.
+XML_TAG_SPECIAL = re.compile(r'(<[^>]+>)')
+# CJK corner brackets, outer pair first; replace_quote_str picks the entry by
+# nesting depth modulo 2.
+CJK_LEFT_QUOTE = ["「", "『"]
+CJK_RIGHT_QUOTE = ["」", "』"]
+# Curly quotation marks that replace_quote_str converts to the CJK pairs above.
+NEED_REPLACED_LEFT_QUOTE = ["“", "‘"]
+NEED_REPLACED_RIGHT_QUOTE = ["”", "’"]


 def process_line(data: str) -> str:
@@ -52,6 +83,12 @@ def process_line(data: str) -> str:
    data = KATAKANA.sub('', data)
    data = data.replace('！？', '？！')
    data = data.replace('。」', '」')
+    # 去除）前多余的标点
+    data = MIXED_RULE4.sub('）', data)
+    while MUL_QUOTE_RULE.search(data):
+        data = MUL_QUOTE_RULE.sub(lambda m: '「' + m.group(1) + '」', data)
+    while SINGLE_QUOTE_RULE.search(data):
+        data = SINGLE_QUOTE_RULE.sub(lambda m: '『' + m.group(1) + '』', data)
    return data


@@ -67,7 +104,39 @@ def process_scn_line(data: str) -> str:
    return ''.join(parts)


-def process_m3t(m3t_path, scn=False):
+def process_xml_line(data: str) -> str:
+    """Apply ``process_line`` to the text of ``data`` while keeping XML tags verbatim.
+
+    ``XML_TAG_SPECIAL`` has one capturing group, so splitting on it yields
+    text at the even indices and the matched tags at the odd indices.
+
+    Args:
+        data: One line of text that may contain ``<...>`` tags.
+
+    Returns:
+        The line with every text segment processed and every tag unchanged.
+    """
+    parts = XML_TAG_SPECIAL.split(data)
+    for i in range(0, len(parts), 2):
+        parts[i] = process_line(parts[i])
+        if i == 0 or not parts[i].startswith('」'):
+            continue
+        # process_line only drops "。" directly before "」" inside one segment.
+        # When tags sit between the two, strip the full stops from the nearest
+        # earlier text segment that is not empty, so adjacent tags such as
+        # "。<a><b>」" are handled as well.
+        for j in range(i - 2, -1, -2):
+            if parts[j]:
+                parts[j] = parts[j].rstrip('。')
+                break
+    return ''.join(parts)
+
+
+def replace_quote_str(s) -> str:
+    """Rewrite the quotation marks in ``s`` as alternating CJK corner brackets.
+
+    Characters listed in ``CJK_LEFT_QUOTE`` or ``NEED_REPLACED_LEFT_QUOTE``
+    open a nesting level, and those in ``CJK_RIGHT_QUOTE`` or
+    ``NEED_REPLACED_RIGHT_QUOTE`` close one. The bracket emitted for a level
+    is entry ``depth % 2`` of the CJK lists, so even levels use the first
+    pair and odd levels the second. All other characters are copied as-is.
+
+    A closing quote without a matching opener is emitted with the first
+    (outer) pair and leaves the depth at zero, so it cannot flip the style
+    of the quotes that follow it.
+
+    Args:
+        s: Text to normalize.
+
+    Returns:
+        The text with its quotation marks replaced.
+    """
+    opening = set(CJK_LEFT_QUOTE) | set(NEED_REPLACED_LEFT_QUOTE)
+    closing = set(CJK_RIGHT_QUOTE) | set(NEED_REPLACED_RIGHT_QUOTE)
+    depth = 0
+    out = []
+    for c in s:
+        if c in opening:
+            out.append(CJK_LEFT_QUOTE[depth % 2])
+            depth += 1
+        elif c in closing:
+            # Clamp at zero: a stray closer must not drive the depth negative.
+            depth = max(depth - 1, 0)
+            out.append(CJK_RIGHT_QUOTE[depth % 2])
+        else:
+            out.append(c)
+    return "".join(out)
+
+
+def process_m3t(m3t_path, type=ScriptType.GENERIC, llm_only=False, replace_quotes=True):
    lines = []
    line_num = 0
    for line in open(m3t_path, "r", encoding="utf-8"):
@@ -76,12 +145,16 @@ def process_m3t(m3t_path, scn=False):
        if line_num == 0:
            line = line.lstrip("\ufeff")  # 移除可能的BOM
        line_num += 1
-        if line.startswith("●") or line.startswith("△"):
+        if (not llm_only and line.startswith("●")) or line.startswith("△"):
            data = line[1:].strip()
-            if scn:
+            if type.is_scn():
                ndata = process_scn_line(data)
+            elif type.is_xml():
+                ndata = process_xml_line(data)
            else:
                ndata = process_line(data)
+            if replace_quotes:
+                ndata = replace_quote_str(ndata)
            if data != ndata:
                print(f"{m3t_path}:{line_num}: {data} -> {ndata}")
                line = line[0] + " " + ndata
@@ -95,8 +168,8 @@ def save_lines(m3t_path, lines):
            f.write(line + "\n")


-def process_m3t_file(m3t_path, target_path=None, dry_run=False, scn=False):
-    lines = process_m3t(m3t_path, scn)
+def process_m3t_file(m3t_path, target_path=None, dry_run=False, type=ScriptType.GENERIC, llm_only=False, replace_quotes=True):
+    lines = process_m3t(m3t_path, type, llm_only, replace_quotes)
    if not dry_run:
        if target_path is None:
            save_lines(m3t_path, lines)
@@ -105,14 +178,16 @@ def process_m3t_file(m3t_path, target_path=None, dry_run=False, scn=False):
            save_lines(target_path, lines)


-def recursive_process_m3t(m3t_path, target_path=None, dry_run=False, scn=False):
+def recursive_process_m3t(m3t_path, target_path=None, dry_run=False, type=ScriptType.GENERIC, llm_only=False, ext=".txt", replace_quotes=True):
    for root, dirs, files in os.walk(m3t_path):
        for file in files:
+            if not file.lower().endswith(ext):
+                continue
            full_path = os.path.join(root, file)
            output_path = None
            if target_path is not None:
                output_path = os.path.join(target_path, file)
-            process_m3t_file(full_path, output_path, dry_run, scn)
+            process_m3t_file(full_path, output_path, dry_run, type, llm_only, replace_quotes)


 if __name__ == "__main__":
@@ -121,9 +196,12 @@ if __name__ == "__main__":
    parser.add_argument("m3t_path", help="Path to M3T file/directory.")
    parser.add_argument("target_path", nargs="?", default=None, help="Path to save processed M3T file/directory. If not specified, overwrite original files.")
    parser.add_argument("-d", "--dry-run", action="store_true", help="Only show changes without saving.")
-    parser.add_argument("--scn", action="store_true", help="Process SCN lines with special rules.")
+    # Default to GENERIC: without it argparse leaves args.type as None and
+    # process_m3t fails on type.is_scn() whenever --type is omitted.
+    parser.add_argument("--type", type=ScriptType.from_string, choices=list(ScriptType), default=ScriptType.GENERIC, help="Process lines with special rules.")
+    parser.add_argument("--llm-only", action="store_true", help="Only process LLM lines.")
+    parser.add_argument("--ext", default=".txt", help="File extension to process in directory mode.")
+    parser.add_argument("--no-replace-quotes", action="store_true", help="Do not replace quotes.")
    args = parser.parse_args()
    if os.path.isdir(args.m3t_path):
-        recursive_process_m3t(args.m3t_path, args.target_path, args.dry_run, args.scn)
+        recursive_process_m3t(args.m3t_path, args.target_path, args.dry_run, args.type, args.llm_only, args.ext, not args.no_replace_quotes)
    else:
-        process_m3t_file(args.m3t_path, args.target_path, args.dry_run, args.scn)
+        process_m3t_file(args.m3t_path, args.target_path, args.dry_run, args.type, args.llm_only, not args.no_replace_quotes)
--- a/m3tpostprocess_unittest.py
+++ b/m3tpostprocess_unittest.py
@@ -1,4 +1,4 @@
-from m3tpostprocess import process_line, process_scn_line
+from m3tpostprocess import process_line, process_scn_line, process_xml_line, replace_quote_str


 def test_process_line(src, expected):
@@ -12,6 +12,17 @@ def test_process_scn_line(src, expected):
    if processed != expected:
        raise AssertionError(f"Expected: {expected}, but got: {processed}")

+def test_process_xml_line(src, expected):
+    """Raise AssertionError unless ``process_xml_line(src)`` equals ``expected``."""
+    actual = process_xml_line(src)
+    if actual == expected:
+        return
+    raise AssertionError(f"Expected: {expected}, but got: {actual}")
+
+
+def test_replace_quote_str(src, expected):
+    """Raise AssertionError unless ``replace_quote_str(src)`` equals ``expected``."""
+    actual = replace_quote_str(src)
+    if actual == expected:
+        return
+    raise AssertionError(f"Expected: {expected}, but got: {actual}")
+

 if __name__ == "__main__":
    test_process_line("Hello....", "Hello……")
@@ -23,5 +34,11 @@ if __name__ == "__main__":
    test_process_line("!?!?", "？！？！")
    test_process_line("これはテストです测试。", "测试。")
    test_process_line("「地形模型。」", "「地形模型」")
+    test_process_line("（测试。。、）", "（测试）")
+    test_process_line('"狗屎\'引号\'就是"一坨""', "「狗屎『引号』就是「一坨」」")
    test_process_scn_line("「地形模型。%command;」", "「地形模型%command;」")
    test_process_scn_line("%cmd;25%的可能。", "%cmd;25％的可能。")
+    test_process_xml_line("<tag>（测试。。、）</tag>", "<tag>（测试）</tag>")
+    test_process_xml_line('"狗屎\'引号\'就是"一坨""<tag ok="this is ok">', '「狗屎『引号』就是「一坨」」<tag ok="this is ok">')
+    test_process_xml_line("「地形模型。<command>」", "「地形模型<command>」")
+    test_replace_quote_str("「狗屎『引号』就是「一坨」」", "「狗屎『引号』就是『一坨』」")