From 960d11779469ef502453478d6a75da12787cea18 Mon Sep 17 00:00:00 2001
From: lifegpc
Date: Tue, 28 Apr 2026 16:26:02 +0800
Subject: [PATCH] Add iinkai extract script

---
 KAGParser.py | 319 +++++++++++++++++++++++++++++++++++++++++++++++++++
 iinkai.py    | 159 ++++++++++++++++++++++++
 2 files changed, 478 insertions(+)
 create mode 100644 KAGParser.py
 create mode 100644 iinkai.py

diff --git a/KAGParser.py b/KAGParser.py
new file mode 100644
index 0000000..f1334a5
--- /dev/null
+++ b/KAGParser.py
@@ -0,0 +1,319 @@
+# SPDX-License-Identifier: LicenseRef-Proprietary
+from __future__ import annotations
+
+import re
+from abc import ABC, abstractmethod
+from dataclasses import dataclass, field
+from typing import Any, Dict, List, Union
+
+# --- Node Definitions ---
+# We use dataclasses to represent the different types of parsed elements.
+
+
+class INode(ABC):
+    @abstractmethod
+    def serialize(self) -> str:
+        pass
+
+
+@dataclass
+class CommentNode(INode):
+    text: str
+
+    def __repr__(self):
+        return f"Comment('{self.text}')"
+
+    def serialize(self) -> str:
+        return f"; {self.text}"
+
+
+@dataclass
+class LabelNode(INode):
+    name: str
+    page: str = ""
+
+    def __repr__(self):
+        return f"Label(name='{self.name}', page='{self.page}')"
+
+    def serialize(self) -> str:
+        if self.page:
+            return f"*{self.name}|{self.page}"
+        return f"*{self.name}"
+
+
+@dataclass
+class TextNode(INode):
+    text: str
+
+    def __repr__(self):
+        return f"Text('{self.text}')"
+
+    def serialize(self) -> str:
+        # In KAG, a literal '[' is escaped as '[['.
+        return self.text.replace("[", "[[")
+
+
+@dataclass
+class EmptyLineNode(INode):
+    def __repr__(self) -> str:
+        return "EmptyLine"
+
+    def serialize(self) -> str:
+        return ""
+
+
+@dataclass
+class TagNode(INode):
+    name: str
+    attributes: Dict[str, Any] = field(default_factory=dict)
+
+    def __repr__(self):
+        return f"Tag(name='{self.name}', attributes={self.attributes})"
+
+    def _serialize_attributes(self) -> str:
+        """Helper to convert the attribute dictionary to a string."""
+        parts = []
+        for key, value in self.attributes.items():
+            if value is True:
+                parts.append(key)
+            else:
+                val_str = str(value)
+                # Quote the value if it contains spaces to ensure it's parsed correctly.
+                if " " in val_str or "=" in val_str:
+                    parts.append(f'{key}="{val_str}"')
+                else:
+                    parts.append(f"{key}={val_str}")
+        return " ".join(parts)
+
+    def serialize(self) -> str:
+        attr_str = self._serialize_attributes()
+        if attr_str:
+            return f"[{self.name} {attr_str}]"
+        return f"[{self.name}]"
+
+
+@dataclass
+class CommandNode(TagNode):  # A command is just a tag with a different syntax
+    def __repr__(self):
+        return f"Command(name='{self.name}', attributes={self.attributes})"
+
+    def serialize(self) -> str:
+        attr_str = self._serialize_attributes()
+        if attr_str:
+            return f"@{self.name} {attr_str}"
+        return f"@{self.name}"
+
+
+@dataclass
+class ScriptBlockNode(INode):
+    script: str
+
+    def __repr__(self):
+        return f"ScriptBlock(script='{self.script[:30]}...')"
+
+    def serialize(self) -> str:
+        return f"[iscript]\n{self.script}\n[endscript]"
+
+
+# A line can contain a mix of text and tags
+ParsedLine = List[Union[TextNode, TagNode]]
+
+# The final parsed script is a list of different node types
+ParsedScript = List[
+    Union[
+        CommentNode, LabelNode, CommandNode, ScriptBlockNode, EmptyLineNode, ParsedLine
+    ]
+]
+
+
+class KAGScriptParser:
+    """
+    Parses a KAG (.ks) script file into a structured list of nodes.
+    """
+
+    # Regex to split a line into text and tags. It keeps the delimiters (the tags).
+    _LINE_SPLIT_RE = re.compile(r"(\[.*?\])")
+
+    # Regex to parse attributes within a tag/command string.
+    # It handles: key=value, key="value", key='value', and boolean keys.
+    _ATTR_RE = re.compile(
+        r"""
+        ([a-zA-Z0-9_]+)     # Attribute key
+        (?:
+            =               # Equals sign
+            (
+                "[^"]*" |   # Double-quoted value
+                '[^']*' |   # Single-quoted value
+                [^\s\]]+    # Unquoted value
+            )
+        )?                  # The entire value part is optional
+        """,
+        re.VERBOSE,
+    )
+
+    def __init__(self, script_text: str):
+        self.lines = script_text.splitlines()
+        self.parsed_script: ParsedScript = []
+
+    def _parse_attributes(self, attr_string: str) -> Dict[str, Any]:
+        """Parses the attribute string of a tag or command."""
+        attributes = {}
+        for match in self._ATTR_RE.finditer(attr_string):
+            key = match.group(1)
+            value = match.group(2)
+
+            if value is None:
+                # Boolean attribute, like [p clickable]
+                attributes[key] = True
+            else:
+                # Un-quote if necessary
+                if value.startswith('"') and value.endswith('"'):
+                    value = value[1:-1]
+                elif value.startswith("'") and value.endswith("'"):
+                    value = value[1:-1]
+
+                # As per C++ code, unescape the ` character
+                value = value.replace("`", "")
+                attributes[key] = value
+        return attributes
+
+    def _parse_tag_or_command(
+        self, content: str, is_command: bool = False
+    ) -> Union[TagNode, CommandNode]:
+        """Parses a tag [name attr=val] or command @name attr=val."""
+        parts = content.strip().split(maxsplit=1)
+        tag_name = parts[0]
+        attr_string = parts[1] if len(parts) > 1 else ""
+
+        attributes = self._parse_attributes(attr_string)
+
+        NodeClass = CommandNode if is_command else TagNode
+        return NodeClass(name=tag_name, attributes=attributes)
+
+    def parse(self, perserve_empty_lines=False) -> ParsedScript:
+        """
+        Executes the parsing process on the entire script.
+
+        perserve_empty_lines: when True, blank lines outside [iscript] blocks
+        are kept as EmptyLineNode entries instead of being dropped.
+        Returns a list of parsed nodes.
+        """
+        self.parsed_script = []
+        in_script_block = False
+        script_buffer = []
+
+        i = 0
+        while i < len(self.lines):
+            line = self.lines[i].strip()
+            i += 1
+
+            if not line:
+                if not perserve_empty_lines:
+                    continue  # Skip empty lines
+                if not in_script_block:
+                    # A blank line inside [iscript] belongs to the script
+                    # buffer below, not to the node list.
+                    self.parsed_script.append(EmptyLineNode())
+                    continue
+
+            # Handle script blocks [iscript]...[endscript]
+            if in_script_block:
+                if line == "[endscript]":
+                    in_script_block = False
+                    self.parsed_script.append(ScriptBlockNode("\n".join(script_buffer)))
+                    script_buffer = []
+                else:
+                    script_buffer.append(line)
+                continue
+
+            if line == "[iscript]":
+                in_script_block = True
+                continue
+
+            # Handle comments
+            if line.startswith(";"):
+                self.parsed_script.append(CommentNode(line[1:].lstrip()))
+                continue
+
+            # Handle labels
+            if line.startswith("*"):
+                label_part = line[1:]
+                if "|" in label_part:
+                    name, page = label_part.split("|", 1)
+                    self.parsed_script.append(LabelNode(name, page))
+                else:
+                    self.parsed_script.append(LabelNode(label_part))
+                continue
+
+            # Handle commands
+            if line.startswith("@"):
+                self.parsed_script.append(
+                    self._parse_tag_or_command(line[1:], is_command=True)
+                )
+                continue
+
+            # Handle line continuation
+            full_line = line
+            while full_line.endswith("\\"):
+                full_line = full_line[:-1].rstrip()
+                if i < len(self.lines):
+                    full_line += " " + self.lines[i].strip()
+                    i += 1
+                else:
+                    break
+
+            # Handle a regular line with text and/or tags
+            parsed_line: ParsedLine = []
+            parts = self._LINE_SPLIT_RE.split(full_line)
+            for part in parts:
+                if not part:
+                    continue
+                if part.startswith("[") and part.endswith("]"):
+                    # It's a tag
+                    if part == "[[r]]":  # Special case from source, though rare
+                        parsed_line.append(TextNode("[r]"))
+                    elif part == "[[[[":  # Another edge case
+                        parsed_line.append(TextNode("[["))
+                    elif part.startswith("[["):  # Escaped literal text
+                        parsed_line.append(TextNode(part[1:]))
+                    else:
+                        parsed_line.append(self._parse_tag_or_command(part[1:-1]))
+                else:
+                    # It's plain text
+                    parsed_line.append(TextNode(part))
+
+            if parsed_line:
+                self.parsed_script.append(parsed_line)
+
+        return self.parsed_script
+
+    @staticmethod
+    def serialize(script: ParsedScript) -> str:
+        """
+        Serializes a complete parsed script (AST) back into a KAG script string.
+        """
+        lines = []
+        for node in script:
+            if isinstance(node, list):
+                # This is a ParsedLine, a mix of text and tags
+                line_parts = [sub_node.serialize() for sub_node in node]
+                lines.append("".join(line_parts))
+            else:
+                # This is a standalone node (Comment, Label, Command, etc.)
+                lines.append(node.serialize())
+        return "\n".join(lines)
+
+
+if __name__ == "__main__":
+    from argparse import ArgumentParser
+    from pathlib import Path
+
+    parser = ArgumentParser("KAGParser", description="KAGParser Demo")
+    parser.add_argument("file")
+    args = parser.parse_args()
+
+    script = KAGScriptParser(Path(args.file).read_text()).parse(
+        perserve_empty_lines=True
+    )
+    for ln in script:
+        print(ln)
diff --git a/iinkai.py b/iinkai.py
new file mode 100644
index 0000000..27fd489
--- /dev/null
+++ b/iinkai.py
@@ -0,0 +1,159 @@
+from KAGParser import *
+from html import escape
+import json
+from os.path import isdir, join, basename, splitext, dirname
+from os import listdir, makedirs
+from typing import List
+from csv import DictWriter
+
+
+def extract_script(script_path: str, output_path: str):
+    """Convert one KAG scenario file into a JSON list of message entries.
+
+    Each ``@page`` command closes the current entry; a ``@【name】`` command
+    sets the speaker of the entry being built. ``&`` and ``<`` in text are
+    escaped and inline tags are rendered as ``<tag attr="value">``.
+    """
+    with open(script_path, "r", encoding="utf-8") as f:
+        script_text = f.read()
+    parser = KAGScriptParser(script_text)
+    script = parser.parse(True)
+    name = None
+    message = ''
+    result = []
+    for line in script:
+        if isinstance(line, CommandNode):
+            cmd = line
+            if cmd.name == 'page':
+                d = {}
+                if name is not None:
+                    d['name'] = name
+                d['message'] = message
+                message = ''
+                result.append(d)
+                name = None
+            elif cmd.name.startswith("【") and cmd.name.endswith("】"):
+                name = cmd.name[1:-1]
+        elif isinstance(line, list):
+            for node in line:
+                if isinstance(node, TextNode):
+                    # Escape text so it cannot be mistaken for inline <tag> markup.
+                    message += node.text.replace("&", "&amp;").replace("<", "&lt;")
+                elif isinstance(node, TagNode):
+                    data = f"<{escape(node.name)}"
+                    for k, v in node.attributes.items():
+                        if v is True:
+                            # Boolean attribute: html.escape() only accepts str.
+                            data += f" {escape(k)}"
+                        else:
+                            data += f' {escape(k)}="{escape(str(v))}"'
+                    data += ">"
+                    message += data
+    if name is not None or message:
+        d = {}
+        if name is not None:
+            d['name'] = name
+        d['message'] = message
+        result.append(d)
+    with open(output_path, "w", encoding="utf-8") as f:
+        json.dump(result, f, ensure_ascii=False, indent=2)
+
+
+def extract_dict(script_path: str, output_path: str):
+    """Extract the glossary embedded in ``dict.scn`` into a CSV file.
+
+    The KAG text between the ``var text = '`` and ``';`` lines is parsed; each
+    label starts a term and each ``@return`` command ends it. The CSV has the
+    columns ``term``, ``translation`` (left empty) and ``description``.
+    """
+    with open(script_path, "r", encoding="utf-8") as f:
+        script_text = f.read()
+    in_dict = False
+    dict_data = '\n'
+    for line in script_text.splitlines():
+        if line == "var text = '":
+            in_dict = True
+        elif line == "';":
+            in_dict = False
+        elif in_dict:
+            dict_data += line + "\n"
+    script = KAGScriptParser(dict_data).parse(True)
+    entries = {}
+    label = None
+    term: List[str] = []
+    for line in script:
+        if isinstance(line, LabelNode):
+            label = line.name[1:]
+        elif isinstance(line, list):
+            if len(term) > 0:
+                term.append('\n')
+            for node in line:
+                if isinstance(node, TextNode):
+                    term.append(node.text)
+                else:
+                    raise ValueError("Unexpected node type in dict.scn", node)
+        elif isinstance(line, CommandNode):
+            if line.name == "return":
+                # Drop the name (presumably the name text and its line break)
+                term.pop(0)
+                term.pop(0)
+                # Drop the katakana reading in 【】 brackets, if present
+                if len(term) > 0 and term[0].startswith("【") and term[0].endswith("】"):
+                    term.pop(0)
+                    term.pop(0)  # ...and the line break after it
+                entries[label] = ''.join(term)
+                term = []
+                label = None
+    with open(output_path, "w", encoding="utf-8-sig", newline="") as f:
+        writer = DictWriter(f, fieldnames=["term", "translation", "description"], lineterminator="\n")
+        writer.writeheader()
+        for k, v in entries.items():
+            writer.writerow({"term": k, "translation": "", "description": v})
+
+
+def extract_script_auto(script_path: str, output_path: str):
+    """Extract a single ``.scn`` file, or every ``.scn`` file in a directory.
+
+    ``dict.scn`` is written as CSV via extract_dict(); any other file is
+    written as JSON via extract_script(). Missing output directories are
+    created first.
+    """
+    if isdir(script_path):
+        for file in listdir(script_path):
+            if not file.lower().endswith(".scn"):
+                continue
+            full_path = join(script_path, file)
+            output_file = splitext(basename(file))[0]
+            if file == "dict.scn":
+                output_file += ".csv"
+            else:
+                output_file += ".json"
+            output_full_path = join(output_path, output_file)
+            pdir = dirname(output_full_path)
+            if pdir and not isdir(pdir):
+                makedirs(pdir, exist_ok=True)
+            if file == "dict.scn":
+                extract_dict(full_path, output_full_path)
+            else:
+                extract_script(full_path, output_full_path)
+    else:
+        pdir = dirname(output_path)
+        if pdir and not isdir(pdir):
+            makedirs(pdir, exist_ok=True)
+        base_name = basename(script_path)
+        if base_name == "dict.scn":
+            extract_dict(script_path, output_path)
+        else:
+            extract_script(script_path, output_path)
+
+
+if __name__ == "__main__":
+    from argparse import ArgumentParser
+    parser = ArgumentParser(description="Process KAG script files")
+    subparser = parser.add_subparsers(title="Commands", dest="command")
+    extract_parser = subparser.add_parser("extract", help="Extract script to JSON")
+    extract_parser.add_argument("script_path", help="Path to KAG script file or directory")
+    extract_parser.add_argument("output_path", help="Path to output JSON file or directory")
+    args = parser.parse_args()
+    if args.command == "extract":
+        extract_script_auto(args.script_path, args.output_path)