# GalScripts/m3tpostprocess.py

import re
import os
import math
from enum import Enum


class ScriptType(str, Enum):
    """Script flavours that select which line-processing rules are applied."""

    GENERIC = "generic"
    SCN = "scn"
    XML = "xml"
    CIRCUS = "circus"

    def __str__(self):
        """Return the raw string value of the member."""
        return self.value

    @staticmethod
    def from_string(s: str):
        """Look up a member by its value, ignoring case.

        Raises:
            ValueError: if no member has the given value.
        """
        wanted = s.lower()
        found = next(
            (member for member in ScriptType if member.value == wanted),
            None,
        )
        if found is None:
            raise ValueError(f"Unknown ScriptType: {s}")
        return found

    def is_scn(self):
        """Return True when this member is ``SCN``."""
        return self == ScriptType.SCN

    def is_xml(self):
        """Return True when this member is ``XML``."""
        return self == ScriptType.XML

    def is_circus(self):
        """Return True when this member is ``CIRCUS``."""
        return self == ScriptType.CIRCUS


# Runs of three or more ASCII dots.
ELLIPSIS = re.compile(r'\.{3,}')
# Runs of one or more "…" characters.
ELLIPSIS_ZH = re.compile(r'…+')
# Runs of one or more "—" characters.
HYPHEN_ZH = re.compile(r'—+')
# (from, to) pairs applied in order with str.replace by process_line.
HALF_FULL_MAP = (
    (',', '，'),
    ('.', '。'),
    ('!', '！'),
    ('?', '？'),
    ('~', '～'),
    ('(', '（'),
    (')', '）'),
    (':', '：'),
    (';', '；'),
    ('-', '—'),
    ('ー', '—'),
    ('&', '＆'),
    ('%', '％'),
    ('・', '·'),
    ('“', '「'),
    ('”', '」'),
    ("‘", '『'),
    ("’", '』'),
)
# "，。、" directly followed by a run of "…—～" (the run is captured).
MIXED_RULE1 = re.compile(r'[，。、]+([…—～]+)')
# A run of "…—～" (captured) directly followed by "，。、".
MIXED_RULE2 = re.compile(r'([…—～]+)[，。、]+')
# Two or more consecutive "…—～" characters; the first one is captured.
MIXED_RULE3 = re.compile(r'([…—～])[…—～]+')
# "，。、" directly before a "）" that ends the string.
MIXED_RULE4 = re.compile(r'([，。、]+)）$')
# Hiragana
HIRAGANA = re.compile(r'[\u3040-\u309F]+')
# Katakana
KATAKANA = re.compile(r'[\u30A0-\u30FF]+')
# The (awful) inline commands that wamsoft embeds in scn text:
# "%...;" , "%r", or "#" followed by 6-8 hex digits and ";".
SCN_SPECIAL = re.compile(r'(%(?:[^%;]*;|r)|#[0-9a-fA-F]{6,8};)')
# Text between ASCII double quotes (greedy: first quote to last quote).
MUL_QUOTE_RULE = re.compile(r'"(.*)"')
# Text between ASCII single quotes (greedy: first quote to last quote).
SINGLE_QUOTE_RULE = re.compile(r"'(.*)'")
# A single "<...>" tag.
XML_TAG_SPECIAL = re.compile(r'(<[^>]+>)')
# CJK quote characters, indexed by nesting depth modulo 2 in replace_quote_str.
CJK_LEFT_QUOTE = ["「", "『"]
CJK_RIGHT_QUOTE = ["」", "』"]
# Curly quotes that replace_quote_str rewrites into CJK quotes.
NEED_REPLACED_LEFT_QUOTE = ["“", "‘"]
NEED_REPLACED_RIGHT_QUOTE = ["”", "’"]
# (left, right) quote pairs that check_dialogue accepts as stand-ins.
QUOTES = (("“", "”"), ("‘", "’"), ("\"", "\""), ("'", "'"), ("「", "」"), ("『", "』"))
# (left, right, accepted stand-in pairs) for each kind of wrapped line
# handled by check_dialogue.
DIALOGUES = (
    ("「", "」", QUOTES), ("『", "』", QUOTES), ("（", "）", (("（", "）"), ("(", ")"))),
)
# "@" followed by ASCII letters/digits; used to split circus lines.
CIRCUS_SPECIAL = re.compile(r'(@[a-zA-Z0-9]+)')


def process_line(data: str) -> str:
    """Normalize the punctuation of one piece of text.

    The steps run in a fixed order: ASCII dot runs become "…", the
    HALF_FULL_MAP replacements are applied, mixed punctuation runs are
    collapsed, kana is removed, and ASCII quote pairs become CJK quotes.

    Args:
        data: The text to normalize.

    Returns:
        The normalized text.
    """

    def _dots_to_ellipsis(match):
        # One "…" for every started group of three dots.
        return '…' * ((len(match.group(0)) + 2) // 3)

    def _repeat_first(match):
        # Keep the run length, but use only the run's first character.
        return match.group(1) * len(match.group(0))

    # Replace runs of dots with ellipsis characters.
    text = ELLIPSIS.sub(_dots_to_ellipsis, data)
    for old, new in HALF_FULL_MAP:
        text = text.replace(old, new)

    # "，。、" mixed with "…—～": drop the former, keep the latter.
    text = MIXED_RULE1.sub(r'\1', text)
    text = MIXED_RULE2.sub(r'\1', text)
    # A mixed run of "…—～" is rewritten using a single kind of character.
    text = MIXED_RULE3.sub(_repeat_first, text)

    # Ellipses and dashes are each normalized to exactly one pair.
    text = ELLIPSIS_ZH.sub('……', text)
    text = HYPHEN_ZH.sub('——', text)

    # Remove all hiragana, then all katakana.
    text = HIRAGANA.sub('', text)
    text = KATAKANA.sub('', text)

    text = text.replace('！？', '？！').replace('。」', '」')
    # Remove redundant punctuation before a trailing "）".
    text = MIXED_RULE4.sub('）', text)

    # Each pass converts the outermost ASCII quote pair; repeat until none
    # is left.
    while MUL_QUOTE_RULE.search(text):
        text = MUL_QUOTE_RULE.sub(r'「\1」', text)
    while SINGLE_QUOTE_RULE.search(text):
        text = SINGLE_QUOTE_RULE.sub(r'『\1』', text)
    return text


def process_scn_line(data: str) -> str:
    """Apply process_line to an scn line while leaving inline commands intact.

    The line is split on SCN_SPECIAL, so even indices hold text and odd
    indices hold the matched commands, which are passed through unchanged.
    """
    segments = SCN_SPECIAL.split(data)
    for idx in range(0, len(segments), 2):
        segments[idx] = process_line(segments[idx])
        # Special case "。%command;」": drop the "。" that ends the text
        # segment before the command when this segment opens with "」".
        if idx and segments[idx].startswith('」'):
            before = idx - 2
            segments[before] = segments[before].rstrip('。')
    return ''.join(segments)


def process_xml_line(data: str) -> str:
    """Apply process_line to a line while leaving "<...>" tags intact.

    The line is split on XML_TAG_SPECIAL, so even indices hold text and odd
    indices hold the matched tags, which are passed through unchanged.
    """
    chunks = XML_TAG_SPECIAL.split(data)
    text_positions = range(0, len(chunks), 2)
    for pos in text_positions:
        chunks[pos] = process_line(chunks[pos])
        if pos == 0 or not chunks[pos].startswith('」'):
            continue
        # "。<tag>」": drop the "。" that ends the text before the tag.
        chunks[pos - 2] = chunks[pos - 2].rstrip('。')
    return ''.join(chunks)


def process_circus_line(data: str) -> str:
    """Apply process_line to a line while leaving "@name" commands intact.

    The line is split on CIRCUS_SPECIAL, so even indices hold text and odd
    indices hold the matched commands, which are passed through unchanged.
    Any "@" left in a text piece is turned into "＠" before processing.
    """
    pieces = CIRCUS_SPECIAL.split(data)
    for k in range(0, len(pieces), 2):
        escaped = pieces[k].replace('@', '＠')
        pieces[k] = process_line(escaped)
        # "。@command」": drop the "。" that ends the text before the command.
        if k > 0 and pieces[k].startswith('」'):
            pieces[k - 2] = pieces[k - 2].rstrip('。')
    return ''.join(pieces)


def replace_quote_str(s) -> str:
    """Rewrite quotes so that nesting levels alternate between 「」 and 『』.

    Every opening quote (CJK or curly) is emitted as the CJK opening quote
    for the current nesting depth, and every closing quote as the matching
    CJK closing quote. Depth 0, 2, 4... uses 「」; depth 1, 3, 5... uses 『』.

    Args:
        s: The text whose quotes should be normalized.

    Returns:
        The text with quotes rewritten; all other characters are unchanged.
    """
    depth = 0
    out = []
    for ch in s:
        if ch in CJK_LEFT_QUOTE or ch in NEED_REPLACED_LEFT_QUOTE:
            out.append(CJK_LEFT_QUOTE[depth % 2])
            depth += 1
        elif ch in CJK_RIGHT_QUOTE or ch in NEED_REPLACED_RIGHT_QUOTE:
            # Clamp at 0: an unmatched closing quote must not push the depth
            # negative, otherwise (-1 % 2 == 1) every later quote would be
            # emitted at the wrong nesting level.
            depth = max(depth - 1, 0)
            out.append(CJK_RIGHT_QUOTE[depth % 2])
        else:
            out.append(ch)
    return "".join(out)


def check_dialogue(source: str, target: str) -> str:
    """Make ``target`` carry the same wrapping quotes/brackets as ``source``.

    For every DIALOGUES entry whose left/right marks wrap ``source``, the
    target's leading and trailing marks are swapped for those marks when
    they are one of the accepted stand-ins, and added when missing.

    Args:
        source: The original line.
        target: The translated line to fix up.

    Returns:
        The adjusted target.
    """
    for opener, closer, stand_ins in DIALOGUES:
        wrapped = source.startswith(opener) and source.endswith(closer)
        if not wrapped:
            continue

        # Swap a stand-in opening mark for the expected one.
        if not target.startswith(opener):
            alt_open = next(
                (lq for lq, _ in stand_ins if target.startswith(lq)), None
            )
            if alt_open is not None:
                target = opener + target[len(alt_open):]

        # Swap a stand-in closing mark for the expected one.
        if not target.endswith(closer):
            alt_close = next(
                (rq for _, rq in stand_ins if target.endswith(rq)), None
            )
            if alt_close is not None:
                target = target[:-len(alt_close)] + closer

        # Whatever is still missing gets added.
        if not target.startswith(opener):
            target = opener + target
        if not target.endswith(closer):
            target += closer
    return target


def process_m3t(m3t_path, type=ScriptType.GENERIC, llm_only=False, replace_quotes=True):
    """Read an M3T file and return its lines with translations post-processed.

    Lines starting with "○" (other than "○ NAME:") are remembered as the
    current source text; an empty line forgets it. Lines starting with "△"
    are always processed, and lines starting with "●" are processed unless
    ``llm_only`` is set. Every changed line is reported with ``print``.

    Args:
        m3t_path: Path of the UTF-8 encoded file to read.
        type: ScriptType selecting the line processor. The name shadows the
            builtin but is kept for backward compatibility.
        llm_only: When true, "●" lines are left untouched.
        replace_quotes: When true, replace_quote_str is applied as well.

    Returns:
        The list of stripped lines, with processed lines rewritten.
    """
    lines = []
    src = None
    # Use a context manager so the file handle is always closed, even if
    # processing a line raises.
    with open(m3t_path, "r", encoding="utf-8") as m3t_file:
        for line_num, line in enumerate(m3t_file, start=1):
            line = line.strip()
            line = line.replace('\u200b', '')  # remove zero-width spaces
            if line_num == 1:
                line = line.lstrip("\ufeff")  # remove a possible BOM
            if line.startswith("○"):
                if not line.startswith("○ NAME:"):
                    src = line[1:].strip()
            if (not llm_only and line.startswith("●")) or line.startswith("△"):
                data = line[1:].strip()
                ndata = data
                if src is not None:
                    ndata = check_dialogue(src, ndata)
                if type.is_scn():
                    ndata = process_scn_line(ndata)
                elif type.is_xml():
                    ndata = process_xml_line(ndata)
                elif type.is_circus():
                    ndata = process_circus_line(ndata)
                else:
                    ndata = process_line(ndata)
                if replace_quotes:
                    ndata = replace_quote_str(ndata)
                if data != ndata:
                    print(f"{m3t_path}:{line_num}: {data} -> {ndata}")
                    # Keep the marker character and re-attach the new text.
                    line = line[0] + " " + ndata
            elif not line and src is not None:
                # A blank line ends the current entry.
                src = None
            lines.append(line)
    return lines


def save_lines(m3t_path, lines):
    """Write ``lines`` to ``m3t_path`` as UTF-8, one per line.

    Args:
        m3t_path: Destination file path; it is overwritten.
        lines: Iterable of strings without trailing newlines.
    """
    with open(m3t_path, "w", encoding="utf-8") as out:
        out.writelines(entry + "\n" for entry in lines)


def process_m3t_file(m3t_path, target_path=None, dry_run=False, type=ScriptType.GENERIC, llm_only=False, replace_quotes=True):
    """Post-process one M3T file and optionally write the result.

    Args:
        m3t_path: File to process.
        target_path: Where to save the result; ``None`` overwrites
            ``m3t_path``. Missing parent directories are created.
        dry_run: When true, nothing is written.
        type: ScriptType passed on to process_m3t.
        llm_only: Passed on to process_m3t.
        replace_quotes: Passed on to process_m3t.
    """
    lines = process_m3t(m3t_path, type, llm_only, replace_quotes)
    if dry_run:
        return
    if target_path is None:
        save_lines(m3t_path, lines)
        return
    # A bare file name has an empty dirname, and os.makedirs('') raises
    # FileNotFoundError, so only create the parent when there is one.
    parent = os.path.dirname(target_path)
    if parent:
        os.makedirs(parent, exist_ok=True)
    save_lines(target_path, lines)


def recursive_process_m3t(m3t_path, target_path=None, dry_run=False, type=ScriptType.GENERIC, llm_only=False, ext=".txt", replace_quotes=True):
    """Post-process every matching file below a directory.

    Args:
        m3t_path: Directory that is walked recursively.
        target_path: Output directory; ``None`` overwrites files in place.
            The layout below ``m3t_path`` is mirrored below it.
        dry_run: When true, nothing is written.
        type: ScriptType passed on to process_m3t_file.
        llm_only: Passed on to process_m3t_file.
        ext: File-name suffix to match, compared case-insensitively.
        replace_quotes: Passed on to process_m3t_file.
    """
    # File names are lower-cased before matching, so the suffix must be too.
    wanted_ext = ext.lower()
    for root, _dirs, files in os.walk(m3t_path):
        for file in files:
            if not file.lower().endswith(wanted_ext):
                continue
            full_path = os.path.join(root, file)
            output_path = None
            if target_path is not None:
                # Keep the path relative to the input directory so that
                # same-named files in different subdirectories do not
                # overwrite each other in the output directory.
                relative = os.path.relpath(full_path, m3t_path)
                output_path = os.path.join(target_path, relative)
            process_m3t_file(full_path, output_path, dry_run, type, llm_only, replace_quotes)


if __name__ == "__main__":
    # Command-line entry point: process a single file or a whole directory.
    from argparse import ArgumentParser

    cli = ArgumentParser(description="Post-process M3T files.")
    cli.add_argument("m3t_path", help="Path to M3T file/directory.")
    cli.add_argument(
        "target_path",
        nargs="?",
        default=None,
        help="Path to save processed M3T file/directory. If not specified, overwrite original files.",
    )
    cli.add_argument("-d", "--dry-run", action="store_true", help="Only show changes without saving.")
    cli.add_argument(
        "--type",
        default=ScriptType.XML,
        type=ScriptType.from_string,
        choices=list(ScriptType),
        help="Process lines with special rules.",
    )
    cli.add_argument("--llm-only", action="store_true", help="Only process LLM lines.")
    cli.add_argument("--ext", default=".txt", help="File extension to process in directory mode.")
    cli.add_argument("--no-replace-quotes", action="store_true", help="Do not replace quotes.")
    opts = cli.parse_args()

    # The flag is negative on the command line but positive in the API.
    do_replace_quotes = not opts.no_replace_quotes
    if not os.path.isdir(opts.m3t_path):
        process_m3t_file(opts.m3t_path, opts.target_path, opts.dry_run, opts.type, opts.llm_only, do_replace_quotes)
    else:
        recursive_process_m3t(opts.m3t_path, opts.target_path, opts.dry_run, opts.type, opts.llm_only, opts.ext, do_replace_quotes)