Add more checks and fixes

2026-01-05 15:51:09 +08:00
parent bc4d83cef6
commit 64b4fb9fb8
2 changed files with 52 additions and 4 deletions
--- a/m3tpostprocess.py
+++ b/m3tpostprocess.py
@@ -8,6 +8,7 @@ class ScriptType(str, Enum):
    GENERIC = "generic"
    SCN = "scn"
    XML = "xml"
+    CIRCUS = "circus"

    def __str__(self):
        return self.value
@@ -25,6 +26,9 @@ class ScriptType(str, Enum):
    def is_xml(self):
        return self == ScriptType.XML

+    def is_circus(self):
+        """Return True when this value is ``ScriptType.CIRCUS``."""
+        return self == ScriptType.CIRCUS
+

 ELLIPSIS = re.compile(r'\.{3,}')
 ELLIPSIS_ZH = re.compile(r'…+')
@@ -66,6 +70,10 @@ CJK_LEFT_QUOTE = ["「", "『"]
 CJK_RIGHT_QUOTE = ["」", "』"]
 NEED_REPLACED_LEFT_QUOTE = ["“", "‘"]
 NEED_REPLACED_RIGHT_QUOTE = ["”", "’"]
+# Opening/closing bracket pairs. check_dialogue() re-adds a pair to a target
+# line when the source line is wrapped in that pair.
+DIALOGUES = (
+    ("「", "」"), ("『", "』"), ("（", "）"),
+)
+# "@" followed by one or more ASCII letters/digits. The capture group makes
+# re.split() keep every match as its own list element.
+CIRCUS_SPECIAL = re.compile(r'(@[a-zA-Z0-9]+)')


 def process_line(data: str) -> str:
@@ -119,6 +127,17 @@ def process_xml_line(data: str) -> str:
    return ''.join(parts)


+def process_circus_line(data: str) -> str:
+    """Post-process one line while leaving ``@tag`` sequences untouched.
+
+    The line is split on ``CIRCUS_SPECIAL``; the text between matches is run
+    through ``process_line`` and the matches themselves are kept verbatim.
+
+    Args:
+        data: The raw line text.
+
+    Returns:
+        The processed line with the matched ``@tag`` sequences preserved.
+    """
+    parts = CIRCUS_SPECIAL.split(data)
+    for i in range(len(parts)):
+        # The pattern has one capture group, so even indices hold plain text
+        # and odd indices hold the matched "@..." sequences.
+        if i % 2 == 0:
+            # Any "@" still present in plain text did not match the pattern;
+            # it is replaced with the fullwidth "＠" before normal processing.
+            parts[i] = process_line(parts[i].replace('@', '＠'))
+            if i > 0 and parts[i].startswith('」'):
+                # This segment opens with a closing quote: drop every trailing
+                # "。" from the previous text segment (i - 2 skips the tag).
+                while parts[i - 2].endswith('。'):
+                    parts[i - 2] = parts[i - 2][:-1]
+    return ''.join(parts)
+
+
 def replace_quote_str(s) -> str:
    depth = 0
    re = ""
@@ -140,28 +159,49 @@ def replace_quote_str(s) -> str:
    return re


+def check_dialogue(source: str, target: str) -> str:
+    """Make ``target`` carry the same enclosing bracket pair as ``source``.
+
+    For each pair in ``DIALOGUES``: when ``source`` both starts with the left
+    bracket and ends with the right one, the missing left and/or right bracket
+    is added to ``target``.
+
+    Args:
+        source: The reference line.
+        target: The line to repair.
+
+    Returns:
+        ``target``, with any missing enclosing bracket added.
+    """
+    for left, right in DIALOGUES:
+        if source.startswith(left) and source.endswith(right):
+            if not target.startswith(left):
+                target = left + target
+            if not target.endswith(right):
+                target = target + right
+    return target
+
+
 def process_m3t(m3t_path, type=ScriptType.GENERIC, llm_only=False, replace_quotes=True):
+    """Read an m3t file and return its lines with post-processing applied.
+
+    Lines starting with "△" (and, unless ``llm_only`` is set, lines starting
+    with "●") are repaired against the most recent "○" line, run through the
+    processor selected by ``type``, and optionally through
+    ``replace_quote_str``. Each changed line is reported with ``print``.
+    NOTE(review): presumably "○" marks source text, "●" a final translation
+    and "△" an LLM translation -- confirm against the m3t format.
+
+    Args:
+        m3t_path: Path of the file to read (opened as UTF-8).
+        type: ``ScriptType`` selecting the per-line processor.
+        llm_only: When true, "●" lines are left untouched.
+        replace_quotes: When true, ``replace_quote_str`` is applied as well.
+
+    Returns:
+        A list of the stripped (and possibly rewritten) lines.
+    """
    lines = []
    line_num = 0
+    src = None
+    # NOTE(review): the file is opened without a ``with`` block, so it is
+    # never closed explicitly.
    for line in open(m3t_path, "r", encoding="utf-8"):
        line = line.strip()
        line = line.replace('\u200b', '')  # strip zero-width spaces
        if line_num == 0:
            line = line.lstrip("\ufeff")  # strip a possible BOM
        line_num += 1
+        # Remember the latest "○" line (except "○ NAME:" lines) so that the
+        # following target lines can be checked against it.
+        if line.startswith("○"):
+            if not line.startswith("○ NAME:"):
+                src = line[1:].strip()
        if (not llm_only and line.startswith("●")) or line.startswith("△"):
            data = line[1:].strip()
+            ndata = data
+            if src is not None:
+                ndata = check_dialogue(src, ndata)
            if type.is_scn():
-                ndata = process_scn_line(data)
+                ndata = process_scn_line(ndata)
            elif type.is_xml():
-                ndata = process_xml_line(data)
+                ndata = process_xml_line(ndata)
+            elif type.is_circus():
+                ndata = process_circus_line(ndata)
            else:
-                ndata = process_line(data)
+                ndata = process_line(ndata)
            if replace_quotes:
                ndata = replace_quote_str(ndata)
            if data != ndata:
                print(f"{m3t_path}:{line_num}: {data} -> {ndata}")
                line = line[0] + " " + ndata
+        # An empty line clears the remembered "○" text.
+        elif not line and src is not None:
+            src = None
        lines.append(line)
    return lines

--- a/m3tpostprocess_unittest.py
+++ b/m3tpostprocess_unittest.py
@@ -1,4 +1,4 @@
-from m3tpostprocess import process_line, process_scn_line, process_xml_line, replace_quote_str
+from m3tpostprocess import process_line, process_scn_line, process_xml_line, replace_quote_str, process_circus_line


 def test_process_line(src, expected):
@@ -24,6 +24,12 @@ def test_replace_quote_str(src, expected):
        raise AssertionError(f"Expected: {expected}, but got: {processed}")


+def test_process_circus_line(src, expected):
+    """Raise AssertionError unless ``process_circus_line(src)`` equals ``expected``."""
+    processed = process_circus_line(src)
+    if processed != expected:
+        raise AssertionError(f"Expected: {expected}, but got: {processed}")
+
+
 if __name__ == "__main__":
    test_process_line("Hello....", "Hello……")
    test_process_line("……。", "……")
@@ -42,3 +48,5 @@ if __name__ == "__main__":
    test_process_xml_line('"狗屎\'引号\'就是"一坨""<tag ok="this is ok">', '「狗屎『引号』就是「一坨」」<tag ok="this is ok">')
    test_process_xml_line("「地形模型。<command>」", "「地形模型<command>」")
    test_replace_quote_str("「狗屎『引号』就是「一坨」」", "「狗屎『引号』就是『一坨』」")
+    test_process_circus_line("这是一个@special标记的测试。", "这是一个@special标记的测试。")
+    test_process_circus_line("其他@测@123试", "其他＠测@123试")