更新后处理脚本
This commit is contained in:
@@ -1,6 +1,29 @@
|
|||||||
import re
|
import re
|
||||||
import os
|
import os
|
||||||
import math
|
import math
|
||||||
|
from enum import Enum
|
||||||
|
|
||||||
|
|
||||||
|
class ScriptType(str, Enum):
    """Kind of script being post-processed.

    Every member is also a ``str`` whose text is its lowercase value.
    """

    GENERIC = "generic"
    SCN = "scn"
    XML = "xml"

    def __str__(self):
        """Return the bare value (``"scn"``) rather than ``ScriptType.SCN``."""
        return self.value

    @staticmethod
    def from_string(s: str):
        """Return the member whose value equals ``s`` compared case-insensitively.

        Raises:
            ValueError: if no member has that value.
        """
        wanted = s.lower()
        found = next(
            (member for member in ScriptType if member.value == wanted),
            None,
        )
        if found is None:
            raise ValueError(f"Unknown ScriptType: {s}")
        return found

    def is_scn(self):
        """Tell whether this member is ``ScriptType.SCN``."""
        return ScriptType.SCN == self

    def is_xml(self):
        """Tell whether this member is ``ScriptType.XML``."""
        return ScriptType.XML == self
|
||||||
|
|
||||||
|
|
||||||
ELLIPSIS = re.compile(r'\.{3,}')
|
ELLIPSIS = re.compile(r'\.{3,}')
|
||||||
@@ -24,13 +47,21 @@ HALF_FULL_MAP = (
|
|||||||
)
|
)
|
||||||
MIXED_RULE1 = re.compile(r'[,。、]+([…—~]+)')
|
MIXED_RULE1 = re.compile(r'[,。、]+([…—~]+)')
|
||||||
MIXED_RULE2 = re.compile(r'([…—~]+)[,。、]+')
|
MIXED_RULE2 = re.compile(r'([…—~]+)[,。、]+')
|
||||||
MIXED_RULE3 = re.compile(f'([…—~])[…—~]+')
|
MIXED_RULE3 = re.compile(r'([…—~])[…—~]+')
|
||||||
|
# A run of ",", "。" or "、" sitting directly before a closing parenthesis at the
# very end of the line. The closer is written as a character class: a bare ")"
# here closes a group that was never opened and re.compile() fails with
# "unbalanced parenthesis". Both the ASCII and the full-width form are accepted.
# NOTE(review): the pattern most likely used a full-width ")" originally - confirm.
MIXED_RULE4 = re.compile(r'([,。、]+)[)）]$')
|
||||||
# 平假名
|
# 平假名
|
||||||
HIRAGANA = re.compile(r'[\u3040-\u309F]+')
|
HIRAGANA = re.compile(r'[\u3040-\u309F]+')
|
||||||
# 片假名
|
# 片假名
|
||||||
KATAKANA = re.compile(r'[\u30A0-\u30FF]+')
|
KATAKANA = re.compile(r'[\u30A0-\u30FF]+')
|
||||||
# 狗屎wamsoft在scn里拉的指令
|
# 狗屎wamsoft在scn里拉的指令
|
||||||
SCN_SPECIAL = re.compile(r'(%(?:[^%;]*;|r)|#[0-9a-fA-F]{6,8};)')
|
SCN_SPECIAL = re.compile(r'(%(?:[^%;]*;|r)|#[0-9a-fA-F]{6,8};)')
|
||||||
|
MUL_QUOTE_RULE = re.compile(r'"(.*)"')
|
||||||
|
SINGLE_QUOTE_RULE = re.compile(r"'(.*)'")
|
||||||
|
XML_TAG_SPECIAL = re.compile(r'(<[^>]+>)')
|
||||||
|
CJK_LEFT_QUOTE = ["「", "『"]
|
||||||
|
CJK_RIGHT_QUOTE = ["」", "』"]
|
||||||
|
NEED_REPLACED_LEFT_QUOTE = ["“", "‘"]
|
||||||
|
NEED_REPLACED_RIGHT_QUOTE = ["”", "’"]
|
||||||
|
|
||||||
|
|
||||||
def process_line(data: str) -> str:
|
def process_line(data: str) -> str:
|
||||||
@@ -52,6 +83,12 @@ def process_line(data: str) -> str:
|
|||||||
data = KATAKANA.sub('', data)
|
data = KATAKANA.sub('', data)
|
||||||
data = data.replace('!?', '?!')
|
data = data.replace('!?', '?!')
|
||||||
data = data.replace('。」', '」')
|
data = data.replace('。」', '」')
|
||||||
|
# 去除)前多余的标点
|
||||||
|
data = MIXED_RULE4.sub(')', data)
|
||||||
|
while MUL_QUOTE_RULE.search(data):
|
||||||
|
data = MUL_QUOTE_RULE.sub(lambda m: '「' + m.group(1) + '」', data)
|
||||||
|
while SINGLE_QUOTE_RULE.search(data):
|
||||||
|
data = SINGLE_QUOTE_RULE.sub(lambda m: '『' + m.group(1) + '』', data)
|
||||||
return data
|
return data
|
||||||
|
|
||||||
|
|
||||||
@@ -67,7 +104,39 @@ def process_scn_line(data: str) -> str:
|
|||||||
return ''.join(parts)
|
return ''.join(parts)
|
||||||
|
|
||||||
|
|
||||||
def process_m3t(m3t_path, scn=False):
|
def process_xml_line(data: str) -> str:
    """Run ``process_line`` on the text of a line while leaving its tags alone.

    The line is split on ``XML_TAG_SPECIAL``; each text piece is processed on
    its own and the tags are put back unchanged.

    Args:
        data: one line of text that may contain ``<...>`` tags.

    Returns:
        The line with every text piece processed and every tag preserved.
    """
    # The pattern has a capturing group, so split() keeps the tags in the
    # result: even indexes hold text, odd indexes hold the tags.
    parts = XML_TAG_SPECIAL.split(data)
    for i in range(0, len(parts), 2):
        parts[i] = process_line(parts[i])
        if i > 0 and parts[i].startswith('」'):
            # A tag separates this closing quote from the preceding text, so
            # a "。" in front of it cannot be seen inside a single piece.
            # Walk back past empty text pieces (two or more adjacent tags)
            # to the nearest real text and drop its trailing "。".
            j = i - 2
            while j > 0 and not parts[j]:
                j -= 2
            parts[j] = parts[j].rstrip('。')
    return ''.join(parts)
|
||||||
|
|
||||||
|
|
||||||
|
def replace_quote_str(s) -> str:
    """Rewrite every quote mark in ``s`` so nesting levels alternate styles.

    Any opening mark found in ``CJK_LEFT_QUOTE`` or ``NEED_REPLACED_LEFT_QUOTE``
    becomes ``CJK_LEFT_QUOTE[depth % 2]``, and closing marks are handled the
    same way with ``CJK_RIGHT_QUOTE``. The outermost level therefore uses the
    first entry of each list and the next level uses the second.

    Args:
        s: the text to rewrite.

    Returns:
        The text with its quote marks replaced; other characters are unchanged.
    """
    depth = 0
    out = []
    for c in s:
        if c in CJK_LEFT_QUOTE or c in NEED_REPLACED_LEFT_QUOTE:
            out.append(CJK_LEFT_QUOTE[depth % 2])
            depth += 1
        elif c in CJK_RIGHT_QUOTE or c in NEED_REPLACED_RIGHT_QUOTE:
            # Never go below zero: an unmatched closing mark would otherwise
            # flip the style of every quote that follows it.
            depth = max(depth - 1, 0)
            out.append(CJK_RIGHT_QUOTE[depth % 2])
        else:
            out.append(c)
    return ''.join(out)
|
||||||
|
|
||||||
|
|
||||||
|
def process_m3t(m3t_path, type=ScriptType.GENERIC, llm_only=False, replace_quotes=True):
|
||||||
lines = []
|
lines = []
|
||||||
line_num = 0
|
line_num = 0
|
||||||
for line in open(m3t_path, "r", encoding="utf-8"):
|
for line in open(m3t_path, "r", encoding="utf-8"):
|
||||||
@@ -76,12 +145,16 @@ def process_m3t(m3t_path, scn=False):
|
|||||||
if line_num == 0:
|
if line_num == 0:
|
||||||
line = line.lstrip("\ufeff") # 移除可能的BOM
|
line = line.lstrip("\ufeff") # 移除可能的BOM
|
||||||
line_num += 1
|
line_num += 1
|
||||||
if line.startswith("●") or line.startswith("△"):
|
if (not llm_only and line.startswith("●")) or line.startswith("△"):
|
||||||
data = line[1:].strip()
|
data = line[1:].strip()
|
||||||
if scn:
|
if type.is_scn():
|
||||||
ndata = process_scn_line(data)
|
ndata = process_scn_line(data)
|
||||||
|
elif type.is_xml():
|
||||||
|
ndata = process_xml_line(data)
|
||||||
else:
|
else:
|
||||||
ndata = process_line(data)
|
ndata = process_line(data)
|
||||||
|
if replace_quotes:
|
||||||
|
ndata = replace_quote_str(ndata)
|
||||||
if data != ndata:
|
if data != ndata:
|
||||||
print(f"{m3t_path}:{line_num}: {data} -> {ndata}")
|
print(f"{m3t_path}:{line_num}: {data} -> {ndata}")
|
||||||
line = line[0] + " " + ndata
|
line = line[0] + " " + ndata
|
||||||
@@ -95,8 +168,8 @@ def save_lines(m3t_path, lines):
|
|||||||
f.write(line + "\n")
|
f.write(line + "\n")
|
||||||
|
|
||||||
|
|
||||||
def process_m3t_file(m3t_path, target_path=None, dry_run=False, scn=False):
|
def process_m3t_file(m3t_path, target_path=None, dry_run=False, type=ScriptType.GENERIC, llm_only=False, replace_quotes=True):
|
||||||
lines = process_m3t(m3t_path, scn)
|
lines = process_m3t(m3t_path, type, llm_only, replace_quotes)
|
||||||
if not dry_run:
|
if not dry_run:
|
||||||
if target_path is None:
|
if target_path is None:
|
||||||
save_lines(m3t_path, lines)
|
save_lines(m3t_path, lines)
|
||||||
@@ -105,14 +178,16 @@ def process_m3t_file(m3t_path, target_path=None, dry_run=False, scn=False):
|
|||||||
save_lines(target_path, lines)
|
save_lines(target_path, lines)
|
||||||
|
|
||||||
|
|
||||||
def recursive_process_m3t(m3t_path, target_path=None, dry_run=False, scn=False):
|
def recursive_process_m3t(m3t_path, target_path=None, dry_run=False, type=ScriptType.GENERIC, llm_only=False, ext=".txt", replace_quotes=True):
    """Process every file under ``m3t_path`` whose name ends with ``ext``.

    Each matching file is handed to ``process_m3t_file`` together with the
    remaining options.

    Args:
        m3t_path: directory to walk.
        target_path: directory for the output files, or ``None`` to pass no
            output path on to ``process_m3t_file``.
        dry_run: forwarded to ``process_m3t_file``.
        type: forwarded to ``process_m3t_file``.
        llm_only: forwarded to ``process_m3t_file``.
        ext: file-name suffix to select, compared case-insensitively.
        replace_quotes: forwarded to ``process_m3t_file``.
    """
    # File names are lower-cased for the comparison, so the suffix has to be
    # lower-cased as well or an upper-case ``ext`` would never match.
    suffix = ext.lower()
    for root, _dirs, files in os.walk(m3t_path):
        for file in files:
            if not file.lower().endswith(suffix):
                continue
            full_path = os.path.join(root, file)
            output_path = None
            if target_path is not None:
                # NOTE: only the base name is kept, so files with the same
                # name in different sub-directories share one output path.
                output_path = os.path.join(target_path, file)
            process_m3t_file(full_path, output_path, dry_run, type, llm_only, replace_quotes)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
@@ -121,9 +196,12 @@ if __name__ == "__main__":
|
|||||||
parser.add_argument("m3t_path", help="Path to M3T file/directory.")
|
parser.add_argument("m3t_path", help="Path to M3T file/directory.")
|
||||||
parser.add_argument("target_path", nargs="?", default=None, help="Path to save processed M3T file/directory. If not specified, overwrite original files.")
|
parser.add_argument("target_path", nargs="?", default=None, help="Path to save processed M3T file/directory. If not specified, overwrite original files.")
|
||||||
parser.add_argument("-d", "--dry-run", action="store_true", help="Only show changes without saving.")
|
parser.add_argument("-d", "--dry-run", action="store_true", help="Only show changes without saving.")
|
||||||
parser.add_argument("--scn", action="store_true", help="Process SCN lines with special rules.")
|
parser.add_argument("--type", type=ScriptType.from_string, choices=list(ScriptType), help="Process lines with special rules.")
|
||||||
|
parser.add_argument("--llm-only", action="store_true", help="Only process LLM lines.")
|
||||||
|
parser.add_argument("--ext", default=".txt", help="File extension to process in directory mode.")
|
||||||
|
parser.add_argument("--no-replace-quotes", action="store_true", help="Do not replace quotes.")
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
if os.path.isdir(args.m3t_path):
|
if os.path.isdir(args.m3t_path):
|
||||||
recursive_process_m3t(args.m3t_path, args.target_path, args.dry_run, args.scn)
|
recursive_process_m3t(args.m3t_path, args.target_path, args.dry_run, args.type, args.llm_only, args.ext, not args.no_replace_quotes)
|
||||||
else:
|
else:
|
||||||
process_m3t_file(args.m3t_path, args.target_path, args.dry_run, args.scn)
|
process_m3t_file(args.m3t_path, args.target_path, args.dry_run, args.type, args.llm_only, not args.no_replace_quotes)
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
from m3tpostprocess import process_line, process_scn_line
|
from m3tpostprocess import process_line, process_scn_line, process_xml_line, replace_quote_str
|
||||||
|
|
||||||
|
|
||||||
def test_process_line(src, expected):
|
def test_process_line(src, expected):
|
||||||
@@ -12,6 +12,17 @@ def test_process_scn_line(src, expected):
|
|||||||
if processed != expected:
|
if processed != expected:
|
||||||
raise AssertionError(f"Expected: {expected}, but got: {processed}")
|
raise AssertionError(f"Expected: {expected}, but got: {processed}")
|
||||||
|
|
||||||
|
def test_process_xml_line(src, expected):
    """Check that ``process_xml_line(src)`` yields ``expected``.

    Raises:
        AssertionError: if the processed text differs from ``expected``.
    """
    processed = process_xml_line(src)
    if processed == expected:
        return
    raise AssertionError(f"Expected: {expected}, but got: {processed}")
|
||||||
|
|
||||||
|
|
||||||
|
def test_replace_quote_str(src, expected):
    """Check that ``replace_quote_str(src)`` yields ``expected``.

    Raises:
        AssertionError: if the rewritten text differs from ``expected``.
    """
    processed = replace_quote_str(src)
    if processed == expected:
        return
    raise AssertionError(f"Expected: {expected}, but got: {processed}")
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
test_process_line("Hello....", "Hello……")
|
test_process_line("Hello....", "Hello……")
|
||||||
@@ -23,5 +34,11 @@ if __name__ == "__main__":
|
|||||||
test_process_line("!?!?", "?!?!")
|
test_process_line("!?!?", "?!?!")
|
||||||
test_process_line("これはテストです测试。", "测试。")
|
test_process_line("これはテストです测试。", "测试。")
|
||||||
test_process_line("「地形模型。」", "「地形模型」")
|
test_process_line("「地形模型。」", "「地形模型」")
|
||||||
|
test_process_line("(测试。。、)", "(测试)")
|
||||||
|
test_process_line('"狗屎\'引号\'就是"一坨""', "「狗屎『引号』就是「一坨」」")
|
||||||
test_process_scn_line("「地形模型。%command;」", "「地形模型%command;」")
|
test_process_scn_line("「地形模型。%command;」", "「地形模型%command;」")
|
||||||
test_process_scn_line("%cmd;25%的可能。", "%cmd;25%的可能。")
|
test_process_scn_line("%cmd;25%的可能。", "%cmd;25%的可能。")
|
||||||
|
test_process_xml_line("<tag>(测试。。、)</tag>", "<tag>(测试)</tag>")
|
||||||
|
test_process_xml_line('"狗屎\'引号\'就是"一坨""<tag ok="this is ok">', '「狗屎『引号』就是「一坨」」<tag ok="this is ok">')
|
||||||
|
test_process_xml_line("「地形模型。<command>」", "「地形模型<command>」")
|
||||||
|
test_replace_quote_str("「狗屎『引号』就是「一坨」」", "「狗屎『引号』就是『一坨』」")
|
||||||
|
|||||||
Reference in New Issue
Block a user