Add iinkai extract script
This commit is contained in:
310
KAGParser.py
Normal file
310
KAGParser.py
Normal file
@@ -0,0 +1,310 @@
|
|||||||
|
# SPDX-License-Identifier: LicenseRef-Proprietary
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import re
|
||||||
|
from abc import ABC, abstractmethod
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from typing import Any, Dict, List, Union
|
||||||
|
|
||||||
|
# --- Node Definitions ---
|
||||||
|
# We use dataclasses to represent the different types of parsed elements.
|
||||||
|
|
||||||
|
|
||||||
|
class INode(ABC):
    """Common abstract base of every node type produced by the parser."""

    @abstractmethod
    def serialize(self) -> str:
        """Return this node rendered back as KAG script text."""
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class CommentNode(INode):
    """A comment line; serialized as ``; <text>``."""

    text: str

    def __repr__(self):
        return "Comment('{}')".format(self.text)

    def serialize(self) -> str:
        return "; {}".format(self.text)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class LabelNode(INode):
    """A label line: ``*name``, or ``*name|page`` when a page part is set."""

    name: str
    page: str = ""

    def __repr__(self):
        return "Label(name='{}', page='{}')".format(self.name, self.page)

    def serialize(self) -> str:
        rendered = f"*{self.name}"
        # The "|page" suffix is only written when a page part exists.
        if self.page:
            rendered += f"|{self.page}"
        return rendered
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class TextNode(INode):
    """A run of plain text inside a line."""

    text: str

    def __repr__(self):
        return "Text('{}')".format(self.text)

    def serialize(self) -> str:
        # KAG writes a literal '[' as '[[', so every '[' in the text is doubled.
        return "[[".join(self.text.split("["))
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class EmptyLineNode(INode):
    """Marks a blank line of the script; serializes to an empty string."""

    def __repr__(self) -> str:
        return "EmptyLine"

    def serialize(self) -> str:
        return str()
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class TagNode(INode):
    """
    An inline tag of the form ``[name key=value flag]``.

    ``attributes`` maps each attribute key to its value. The value ``True``
    marks a value-less (boolean) attribute, which is written as the bare key;
    every other value is written as ``key=value`` using ``str(value)``.
    """

    name: str
    attributes: Dict[str, Any] = field(default_factory=dict)

    def __repr__(self):
        return f"Tag(name='{self.name}', attributes={self.attributes})"

    @staticmethod
    def _quote_value(val_str: str) -> str:
        """Return ``val_str`` as it must appear after ``key=``, quoted if needed."""
        # The parser reads an unquoted value up to the next whitespace or ']',
        # treats "key=" with nothing after it as a boolean key, and strips a
        # matching pair of surrounding quotes. Any value that would be altered
        # by those rules has to be quoted to survive a round trip.
        needs_quotes = (
            not val_str
            or val_str[0] in "\"'"
            or any(ch.isspace() or ch in "=]" for ch in val_str)
        )
        if not needs_quotes:
            return val_str
        # Prefer double quotes; switch to single quotes only when the value
        # itself contains a double quote (and no single quote).
        quote = "'" if '"' in val_str and "'" not in val_str else '"'
        return f"{quote}{val_str}{quote}"

    def _serialize_attributes(self) -> str:
        """Helper to convert the attribute dictionary to a string."""
        parts = []
        for key, value in self.attributes.items():
            if value is True:
                # Boolean attribute: only the key is written.
                parts.append(key)
            else:
                parts.append(f"{key}={self._quote_value(str(value))}")
        return " ".join(parts)

    def serialize(self) -> str:
        """Return the tag as ``[name attrs]``, or ``[name]`` without attributes."""
        attr_str = self._serialize_attributes()
        if attr_str:
            return f"[{self.name} {attr_str}]"
        return f"[{self.name}]"
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class CommandNode(TagNode):
    """A tag written in line form, ``@name attrs``, instead of ``[name attrs]``."""

    def __repr__(self):
        return "Command(name='{}', attributes={})".format(self.name, self.attributes)

    def serialize(self) -> str:
        rendered_attrs = self._serialize_attributes()
        # Without attributes the command is just "@name".
        if not rendered_attrs:
            return f"@{self.name}"
        return f"@{self.name} {rendered_attrs}"
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class ScriptBlockNode(INode):
    """The text found between an ``[iscript]`` line and an ``[endscript]`` line."""

    script: str

    def __repr__(self):
        # Only the first 30 characters are shown; "..." is always appended.
        preview = self.script[:30]
        return "ScriptBlock(script='{}...')".format(preview)

    def serialize(self) -> str:
        return "[iscript]\n{}\n[endscript]".format(self.script)
|
||||||
|
|
||||||
|
|
||||||
|
# A line can contain a mix of text and tags
ParsedLine = List[Union[TextNode, TagNode]]

# The final parsed script is a list of top-level entries: standalone nodes, or
# a ParsedLine for a regular line. EmptyLineNode is included because parse()
# emits it when asked to preserve blank lines.
ParsedScript = List[
    Union[
        CommentNode,
        LabelNode,
        CommandNode,
        ScriptBlockNode,
        EmptyLineNode,
        ParsedLine,
    ]
]
|
||||||
|
|
||||||
|
|
||||||
|
class KAGScriptParser:
    """
    Parses a KAG (.ks) script file into a structured list of nodes.

    The script text is split into lines when the parser is created; parse()
    builds the node list, returns it, and also keeps it on ``parsed_script``.
    """

    # Regex to split a line into text and tags. It keeps the delimiters (the tags).
    # The match is non-greedy, so every tag ends at the first ']' after its '['.
    _LINE_SPLIT_RE = re.compile(r"(\[.*?\])")

    # Regex to parse attributes within a tag/command string.
    # It handles: key=value, key="value", key='value', and boolean keys.
    _ATTR_RE = re.compile(
        r"""
        ([a-zA-Z0-9_]+)         # Attribute key
        (?:
            =                   # Equals sign
            (
                "[^"]*" |       # Double-quoted value
                '[^']*' |       # Single-quoted value
                [^\s\]]+        # Unquoted value
            )
        )?                      # The entire value part is optional
        """,
        re.VERBOSE,
    )

    def __init__(self, script_text: str):
        self.lines = script_text.splitlines()
        self.parsed_script: ParsedScript = []

    def _parse_attributes(self, attr_string: str) -> Dict[str, Any]:
        """
        Parses the attribute string of a tag or command.

        Returns a dict mapping each key to its string value, or to ``True``
        for a key that has no ``=value`` part. Text that the attribute regex
        does not match is skipped.
        """
        attributes: Dict[str, Any] = {}
        for match in self._ATTR_RE.finditer(attr_string):
            key = match.group(1)
            value = match.group(2)

            if value is None:
                # Boolean attribute, like [p clickable]
                attributes[key] = True
                continue

            # Un-quote if necessary
            if value.startswith('"') and value.endswith('"'):
                value = value[1:-1]
            elif value.startswith("'") and value.endswith("'"):
                value = value[1:-1]

            # As per C++ code, unescape the ` character
            # (every backtick is removed from the value).
            attributes[key] = value.replace("`", "")
        return attributes

    def _parse_tag_or_command(
        self, content: str, is_command: bool = False
    ) -> "Union[TagNode, CommandNode]":
        """
        Parses a tag [name attr=val] or command @name attr=val.

        ``content`` is the text without the surrounding brackets or the
        leading '@'. The first whitespace-separated word is the name and the
        rest is the attribute string.
        """
        parts = content.strip().split(maxsplit=1)
        # An empty tag ("[]") or a bare "@" has no name at all; keep it as a
        # nameless node instead of failing on parts[0].
        tag_name = parts[0] if parts else ""
        attr_string = parts[1] if len(parts) > 1 else ""

        attributes = self._parse_attributes(attr_string)

        NodeClass = CommandNode if is_command else TagNode
        return NodeClass(name=tag_name, attributes=attributes)

    def parse(self, perserve_empty_lines: bool = False) -> "ParsedScript":
        """
        Executes the parsing process on the entire script.

        Args:
            perserve_empty_lines: When true, a blank line outside an
                [iscript] block becomes an EmptyLineNode and a blank line
                inside a block is kept in the block's text. When false,
                blank lines are dropped everywhere.

        Returns:
            The list of parsed nodes (also stored on ``self.parsed_script``).
        """
        self.parsed_script = []
        in_script_block = False
        script_buffer: List[str] = []

        i = 0
        while i < len(self.lines):
            # Every line is handled with its surrounding whitespace removed.
            line = self.lines[i].strip()
            i += 1

            if not line:
                if in_script_block:
                    # A blank line inside a script block belongs to the script
                    # text; it must not surface as a separate EmptyLineNode
                    # ahead of the block.
                    if perserve_empty_lines:
                        script_buffer.append(line)
                elif perserve_empty_lines:
                    self.parsed_script.append(EmptyLineNode())
                continue

            # Handle script blocks [iscript]...[endscript]
            if in_script_block:
                if line == "[endscript]":
                    in_script_block = False
                    self.parsed_script.append(ScriptBlockNode("\n".join(script_buffer)))
                    script_buffer = []
                else:
                    script_buffer.append(line)
                continue

            if line == "[iscript]":
                in_script_block = True
                continue

            # Handle comments
            if line.startswith(";"):
                self.parsed_script.append(CommentNode(line[1:].lstrip()))
                continue

            # Handle labels
            if line.startswith("*"):
                label_part = line[1:]
                if "|" in label_part:
                    name, page = label_part.split("|", 1)
                    self.parsed_script.append(LabelNode(name, page))
                else:
                    self.parsed_script.append(LabelNode(label_part))
                continue

            # Handle commands
            if line.startswith("@"):
                self.parsed_script.append(
                    self._parse_tag_or_command(line[1:], is_command=True)
                )
                continue

            # Handle line continuation: a trailing backslash joins the next
            # line (stripped, separated by one space) onto this one.
            full_line = line
            while full_line.endswith("\\"):
                full_line = full_line[:-1].rstrip()
                if i < len(self.lines):
                    full_line += " " + self.lines[i].strip()
                    i += 1
                else:
                    break

            # Handle a regular line with text and/or tags
            parsed_line: ParsedLine = []
            for part in self._LINE_SPLIT_RE.split(full_line):
                if not part:
                    continue
                if part.startswith("[") and part.endswith("]"):
                    if part.startswith("[["):
                        # Escaped literal text: "[[x]" stands for the text "[x]".
                        # NOTE(review): a "[[" with no later "]" on the line is
                        # not matched by _LINE_SPLIT_RE and stays in the plain
                        # text with both brackets.
                        parsed_line.append(TextNode(part[1:]))
                    else:
                        # It's a tag
                        parsed_line.append(self._parse_tag_or_command(part[1:-1]))
                else:
                    # It's plain text
                    parsed_line.append(TextNode(part))

            if parsed_line:
                self.parsed_script.append(parsed_line)

        if in_script_block:
            # [iscript] was never closed: keep what was collected rather than
            # silently discarding the script text.
            self.parsed_script.append(ScriptBlockNode("\n".join(script_buffer)))

        return self.parsed_script
|
||||||
|
|
||||||
|
def serialize(script: ParsedScript) -> str:
    """
    Turns a parsed script back into KAG script text.

    Each entry of ``script`` yields one output line: a list entry (a parsed
    line of text and tags) is the concatenation of its parts, any other entry
    is its own ``serialize()`` result. Lines are joined with a newline.
    """

    def render(entry) -> str:
        if isinstance(entry, list):
            return "".join(piece.serialize() for piece in entry)
        return entry.serialize()

    return "\n".join(render(entry) for entry in script)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    from argparse import ArgumentParser
    from pathlib import Path

    # Demo: parse the file named on the command line (blank lines kept) and
    # print one parsed entry per output line.
    cli = ArgumentParser(prog="KAGParser", description="KAGParser Demo")
    cli.add_argument("file")
    options = cli.parse_args()

    source_text = Path(options.file).read_text()
    entries = KAGScriptParser(source_text).parse(perserve_empty_lines=True)
    for entry in entries:
        print(entry)
|
||||||
135
iinkai.py
Normal file
135
iinkai.py
Normal file
@@ -0,0 +1,135 @@
|
|||||||
|
from KAGParser import *
|
||||||
|
from html import escape
|
||||||
|
import json
|
||||||
|
from os.path import isdir, join, basename, splitext, dirname
|
||||||
|
from os import listdir, makedirs
|
||||||
|
from typing import List
|
||||||
|
from csv import DictWriter
|
||||||
|
|
||||||
|
|
||||||
|
def _message_entry(name, message):
    """Build one output record; "name" is only present when a speaker is set."""
    entry = {}
    if name is not None:
        entry['name'] = name
    entry['message'] = message
    return entry


def _tag_markup(node):
    """Render an inline tag as ``<name key="value" flag>`` with escaped parts."""
    data = f"<{escape(node.name)}"
    for k, v in node.attributes.items():
        if v is True:
            # Boolean attribute: there is no value to escape, write the bare key.
            data += f" {escape(k)}"
        else:
            data += f' {escape(k)}="{escape(str(v))}"'
    return data + ">"


def extract_script(script_path: str, output_path: str):
    """
    Extract the messages of one KAG script into a JSON file.

    The script is read as UTF-8 and parsed with blank lines preserved. Text
    and inline tags of regular lines are accumulated into the current
    message: text has ``&`` and ``<`` escaped, tags are written as
    ``<name key="value">`` markup. A ``@【...】`` command sets the speaker
    name (without the brackets); a ``@page`` command closes the current
    message. Whatever is still pending at the end of the script is written
    as a last record.

    Args:
        script_path: Path of the script file to read.
        output_path: Path of the JSON file to write; it receives a list of
            objects with a "message" key and, when a speaker was set, a
            "name" key.
    """
    with open(script_path, "r", encoding="utf-8") as f:
        script_text = f.read()
    script = KAGScriptParser(script_text).parse(True)

    name = None
    message = ''
    result = []
    for line in script:
        if isinstance(line, CommandNode):
            if line.name == 'page':
                result.append(_message_entry(name, message))
                message = ''
                name = None
            elif line.name.startswith("【") and line.name.endswith("】"):
                name = line.name[1:-1]
        elif isinstance(line, list):
            for node in line:
                if isinstance(node, TextNode):
                    # Escape the two characters that would otherwise be
                    # confused with the tag markup emitted below.
                    message += node.text.replace("&", "&amp;").replace("<", "&lt;")
                elif isinstance(node, TagNode):
                    message += _tag_markup(node)

    # Flush a message that was not closed by a final @page.
    if name is not None or message:
        result.append(_message_entry(name, message))

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(result, f, ensure_ascii=False, indent=2)
|
||||||
|
|
||||||
|
|
||||||
|
def extract_dict(script_path: str, output_path: str):
    """
    Extract the dictionary entries embedded in a script into a CSV file.

    The lines between a ``var text = '`` line and a ``';`` line are collected
    and parsed as KAG text. Inside that text, a label starts an entry (the
    label name without its first character is the term) and ``@return`` ends
    it. The first text line of an entry (its name) is dropped, as is a
    following line that is wrapped in ``【...】``; the remaining lines, joined
    with newlines, become the description.

    Args:
        script_path: Path of the script file to read (UTF-8).
        output_path: Path of the CSV file to write, with the columns
            "term", "translation" (left empty) and "description".

    Raises:
        ValueError: If a text line of the embedded data contains a tag.
    """
    with open(script_path, "r", encoding="utf-8") as f:
        script_text = f.read()

    in_dict = False
    dict_data = '\n'
    for line in script_text.splitlines():
        if line == "var text = '":
            in_dict = True
        elif line == "';":
            in_dict = False
        elif in_dict:
            dict_data += line + "\n"

    script = KAGScriptParser(dict_data).parse(True)
    entries = {}
    label = None
    # Text lines seen since the last @return, one string per script line.
    body_lines: List[str] = []
    for line in script:
        if isinstance(line, LabelNode):
            label = line.name[1:]
        elif isinstance(line, list):
            texts = []
            for node in line:
                if not isinstance(node, TextNode):
                    raise ValueError("Unexpected node type in dict.scn", node)
                texts.append(node.text)
            body_lines.append(''.join(texts))
        elif isinstance(line, CommandNode) and line.name == "return":
            # Drop the name line. Slicing keeps an entry that has no further
            # lines from raising; it simply gets an empty description.
            remaining = body_lines[1:]
            # Drop the katakana reading line when one follows the name.
            if remaining and remaining[0].startswith("【") and remaining[0].endswith("】"):
                remaining = remaining[1:]
            entries[label] = '\n'.join(remaining)
            body_lines = []
            label = None

    with open(output_path, "w", encoding="utf-8-sig", newline="") as f:
        writer = DictWriter(f, fieldnames=["term", "translation", "description"], lineterminator="\n")
        writer.writeheader()
        for k, v in entries.items():
            writer.writerow({"term": k, "translation": "", "description": v})
|
||||||
|
|
||||||
|
def extract_script_auto(script_path: str, output_path: str):
    """
    Extract a single script, or every ``.scn`` file of a directory.

    A file named ``dict.scn`` goes through extract_dict, every other file
    through extract_script. In directory mode each input produces
    ``<stem>.csv`` (dictionary) or ``<stem>.json`` (script) inside
    ``output_path``; in single-file mode ``output_path`` is the output file.
    The output's parent directory is created when it does not exist.
    """

    def ensure_parent(path):
        parent = dirname(path)
        if parent and not isdir(parent):
            makedirs(parent, exist_ok=True)

    if not isdir(script_path):
        ensure_parent(output_path)
        if basename(script_path) == "dict.scn":
            extract_dict(script_path, output_path)
        else:
            extract_script(script_path, output_path)
        return

    for file in listdir(script_path):
        if not file.lower().endswith(".scn"):
            continue
        is_dictionary = file == "dict.scn"
        source = join(script_path, file)
        stem = splitext(basename(file))[0]
        target = join(output_path, stem + (".csv" if is_dictionary else ".json"))
        ensure_parent(target)
        if is_dictionary:
            extract_dict(source, target)
        else:
            extract_script(source, target)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    from argparse import ArgumentParser

    # Command-line entry point; "extract" is the only subcommand defined here.
    cli = ArgumentParser(description="Process KAG script files")
    commands = cli.add_subparsers(title="Commands", dest="command")

    extract_cmd = commands.add_parser("extract", help="Extract script to JSON")
    extract_cmd.add_argument("script_path", help="Path to KAG script file or directory")
    extract_cmd.add_argument("output_path", help="Path to output JSON file or directory")

    options = cli.parse_args()
    if options.command == "extract":
        extract_script_auto(options.script_path, options.output_path)
|
||||||
Reference in New Issue
Block a user