Add iinkai extract script

2026-04-28 16:26:02 +08:00
parent 222dbd221a
commit 960d117794
2 changed files with 445 additions and 0 deletions
--- a/KAGParser.py
+++ b/KAGParser.py
@@ -0,0 +1,310 @@
+# SPDX-License-Identifier: LicenseRef-Proprietary
+from __future__ import annotations
+
+import re
+from abc import ABC, abstractmethod
+from dataclasses import dataclass, field
+from typing import Any, Dict, List, Union
+
+# --- Node Definitions ---
+# We use dataclasses to represent the different types of parsed elements.
+
+
+class INode(ABC):
+    @abstractmethod
+    def serialize(self) -> str:
+        pass
+
+
+@dataclass
+class CommentNode(INode):
+    text: str
+
+    def __repr__(self):
+        return f"Comment('{self.text}')"
+
+    def serialize(self) -> str:
+        return f"; {self.text}"
+
+
+@dataclass
+class LabelNode(INode):
+    name: str
+    page: str = ""
+
+    def __repr__(self):
+        return f"Label(name='{self.name}', page='{self.page}')"
+
+    def serialize(self) -> str:
+        if self.page:
+            return f"*{self.name}|{self.page}"
+        return f"*{self.name}"
+
+
+@dataclass
+class TextNode(INode):
+    text: str
+
+    def __repr__(self):
+        return f"Text('{self.text}')"
+
+    def serialize(self) -> str:
+        # In KAG, a literal '[' is escaped as '[['.
+        return self.text.replace("[", "[[")
+
+
+@dataclass
+class EmptyLineNode(INode):
+    def __repr__(self) -> str:
+        return f"EmptyLine"
+
+    def serialize(self) -> str:
+        return ""
+
+
+@dataclass
+class TagNode(INode):
+    name: str
+    attributes: Dict[str, Any] = field(default_factory=dict)
+
+    def __repr__(self):
+        return f"Tag(name='{self.name}', attributes={self.attributes})"
+
+    def _serialize_attributes(self) -> str:
+        """Helper to convert the attribute dictionary to a string."""
+        parts = []
+        for key, value in self.attributes.items():
+            if value is True:
+                parts.append(key)
+            else:
+                val_str = str(value)
+                # Quote the value if it contains spaces to ensure it's parsed correctly.
+                if " " in val_str or "=" in val_str:
+                    parts.append(f'{key}="{val_str}"')
+                else:
+                    parts.append(f"{key}={val_str}")
+        return " ".join(parts)
+
+    def serialize(self) -> str:
+        attr_str = self._serialize_attributes()
+        if attr_str:
+            return f"[{self.name} {attr_str}]"
+        return f"[{self.name}]"
+
+
+@dataclass
+class CommandNode(TagNode):  # A command is just a tag with a different syntax
+    def __repr__(self):
+        return f"Command(name='{self.name}', attributes={self.attributes})"
+
+    def serialize(self) -> str:
+        attr_str = self._serialize_attributes()
+        if attr_str:
+            return f"@{self.name} {attr_str}"
+        return f"@{self.name}"
+
+
+@dataclass
+class ScriptBlockNode(INode):
+    script: str
+
+    def __repr__(self):
+        return f"ScriptBlock(script='{self.script[:30]}...')"
+
+    def serialize(self) -> str:
+        return f"[iscript]\n{self.script}\n[endscript]"
+
+
+# A line can contain a mix of text and tags
+ParsedLine = List[Union[TextNode, TagNode]]
+
+# The final parsed script is a list of different node types
+ParsedScript = List[
+    Union[CommentNode, LabelNode, CommandNode, ScriptBlockNode, ParsedLine]
+]
+
+
+class KAGScriptParser:
+    """
+    Parses a KAG (.ks) script file into a structured list of nodes.
+    """
+
+    # Regex to split a line into text and tags. It keeps the delimiters (the tags).
+    _LINE_SPLIT_RE = re.compile(r"(\[.*?\])")
+
+    # Regex to parse attributes within a tag/command string.
+    # It handles: key=value, key="value", key='value', and boolean keys.
+    _ATTR_RE = re.compile(
+        r"""
+        ([a-zA-Z0-9_]+)      # Attribute key
+        (?:
+          =                 # Equals sign
+          (
+            "[^"]*" |       # Double-quoted value
+            '[^']*' |       # Single-quoted value
+            [^\s\]]+        # Unquoted value
+          )
+        )?                  # The entire value part is optional
+        """,
+        re.VERBOSE,
+    )
+
+    def __init__(self, script_text: str):
+        self.lines = script_text.splitlines()
+        self.parsed_script: ParsedScript = []
+
+    def _parse_attributes(self, attr_string: str) -> Dict[str, Any]:
+        """Parses the attribute string of a tag or command."""
+        attributes = {}
+        for match in self._ATTR_RE.finditer(attr_string):
+            key = match.group(1)
+            value = match.group(2)
+
+            if value is None:
+                # Boolean attribute, like [p clickable]
+                attributes[key] = True
+            else:
+                # Un-quote if necessary
+                if value.startswith('"') and value.endswith('"'):
+                    value = value[1:-1]
+                elif value.startswith("'") and value.endswith("'"):
+                    value = value[1:-1]
+
+                # As per C++ code, unescape the ` character
+                value = value.replace("`", "")
+                attributes[key] = value
+        return attributes
+
+    def _parse_tag_or_command(
+        self, content: str, is_command: bool = False
+    ) -> Union[TagNode, CommandNode]:
+        """Parses a tag [name attr=val] or command @name attr=val."""
+        parts = content.strip().split(maxsplit=1)
+        tag_name = parts[0]
+        attr_string = parts[1] if len(parts) > 1 else ""
+
+        attributes = self._parse_attributes(attr_string)
+
+        NodeClass = CommandNode if is_command else TagNode
+        return NodeClass(name=tag_name, attributes=attributes)
+
+    def parse(self, perserve_empty_lines=False) -> ParsedScript:
+        """
+        Executes the parsing process on the entire script.
+        Returns a list of parsed nodes.
+        """
+        self.parsed_script = []
+        in_script_block = False
+        script_buffer = []
+
+        i = 0
+        while i < len(self.lines):
+            line = self.lines[i].strip()
+            i += 1
+
+            if not line:
+                if perserve_empty_lines:
+                    self.parsed_script.append(EmptyLineNode())
+                else:
+                    continue  # Skip empty lines
+
+            # Handle script blocks [iscript]...[endscript]
+            if in_script_block:
+                if line == "[endscript]":
+                    in_script_block = False
+                    self.parsed_script.append(ScriptBlockNode("\n".join(script_buffer)))
+                    script_buffer = []
+                else:
+                    script_buffer.append(line)
+                continue
+
+            if line == "[iscript]":
+                in_script_block = True
+                continue
+
+            # Handle comments
+            if line.startswith(";"):
+                self.parsed_script.append(CommentNode(line[1:].lstrip()))
+                continue
+
+            # Handle labels
+            if line.startswith("*"):
+                label_part = line[1:]
+                if "|" in label_part:
+                    name, page = label_part.split("|", 1)
+                    self.parsed_script.append(LabelNode(name, page))
+                else:
+                    self.parsed_script.append(LabelNode(label_part))
+                continue
+
+            # Handle commands
+            if line.startswith("@"):
+                self.parsed_script.append(
+                    self._parse_tag_or_command(line[1:], is_command=True)
+                )
+                continue
+
+            # Handle line continuation
+            full_line = line
+            while full_line.endswith("\\"):
+                full_line = full_line[:-1].rstrip()
+                if i < len(self.lines):
+                    full_line += " " + self.lines[i].strip()
+                    i += 1
+                else:
+                    break
+
+            # Handle a regular line with text and/or tags
+            parsed_line: ParsedLine = []
+            parts = self._LINE_SPLIT_RE.split(full_line)
+            for part in parts:
+                if not part:
+                    continue
+                if part.startswith("[") and part.endswith("]"):
+                    # It's a tag
+                    if part == "[[r]]":  # Special case from source, though rare
+                        parsed_line.append(TextNode("[r]"))
+                    elif part == "[[[[":  # Another edge case
+                        parsed_line.append(TextNode("[["))
+                    elif part.startswith("[["):  # Escaped literal text
+                        parsed_line.append(TextNode(part[1:]))
+                    else:
+                        parsed_line.append(self._parse_tag_or_command(part[1:-1]))
+                else:
+                    # It's plain text
+                    parsed_line.append(TextNode(part))
+
+            if parsed_line:
+                self.parsed_script.append(parsed_line)
+
+        return self.parsed_script
+
+    def serialize(script: ParsedScript) -> str:
+        """
+        Serializes a complete parsed script (AST) back into a KAG script string.
+        """
+        lines = []
+        for node in script:
+            if isinstance(node, list):
+                # This is a ParsedLine, a mix of text and tags
+                line_parts = [sub_node.serialize() for sub_node in node]
+                lines.append("".join(line_parts))
+            else:
+                # This is a standalone node (Comment, Label, Command, etc.)
+                lines.append(node.serialize())
+        return "\n".join(lines)
+
+
+if __name__ == "__main__":
+    from argparse import ArgumentParser
+    from pathlib import Path
+
+    parser = ArgumentParser("KAGParser", description="KAGParser Demo")
+    parser.add_argument("file")
+    args = parser.parse_args()
+
+    script = KAGScriptParser(Path(args.file).read_text()).parse(
+        perserve_empty_lines=True
+    )
+    for ln in script:
+        print(ln)
--- a/iinkai.py
+++ b/iinkai.py
@@ -0,0 +1,135 @@
+from KAGParser import *
+from html import escape
+import json
+from os.path import isdir, join, basename, splitext, dirname
+from os import listdir, makedirs
+from typing import List
+from csv import DictWriter
+
+
+def extract_script(script_path: str, output_path: str):
+    with open(script_path, "r", encoding="utf-8") as f:
+        script_text = f.read()
+    parser = KAGScriptParser(script_text)
+    script = parser.parse(True)
+    name = None
+    message = ''
+    result = []
+    for line in script:
+        if isinstance(line, CommandNode):
+            cmd = line
+            if cmd.name == 'page':
+                d = {}
+                if name is not None:
+                    d['name'] = name
+                d['message'] = message
+                message = ''
+                result.append(d)
+                name = None
+            elif cmd.name.startswith("【") and cmd.name.endswith("】"):
+                name = cmd.name[1:-1]
+        elif isinstance(line, list):
+            for node in line:
+                if isinstance(node, TextNode):
+                    message += node.text.replace("&", "&amp;").replace("<", "&lt;")
+                elif isinstance(node, TagNode):
+                    data = f"<{escape(node.name)}"
+                    for k, v in node.attributes.items():
+                        data += f' {escape(k)}="{escape(v)}"'
+                    data += ">"
+                    message += data
+    if name is not None or message:
+        d = {}
+        if name is not None:
+            d['name'] = name
+        d['message'] = message
+        result.append(d)
+    with open(output_path, "w", encoding="utf-8") as f:
+        json.dump(result, f, ensure_ascii=False, indent=2)
+
+
+def extract_dict(script_path: str, output_path: str):
+    with open(script_path, "r", encoding="utf-8") as f:
+        script_text = f.read()
+    in_dict = False
+    dict_data = '\n'
+    for line in script_text.splitlines():
+        if line == "var text = '":
+            in_dict = True
+        elif line == "';":
+            in_dict = False
+        elif in_dict:
+            dict_data += line + "\n"
+    script = KAGScriptParser(dict_data).parse(True)
+    dict = {}
+    label = None
+    term: List[str] = []
+    for line in script:
+        if isinstance(line, LabelNode):
+            label = line.name[1:]
+        elif isinstance(line, list):
+            if len(term) > 0:
+                term.append('\n')
+            for node in line:
+                if isinstance(node, TextNode):
+                    term.append(node.text)
+                else:
+                    raise ValueError("Unexpected node type in dict.scn", node)
+        elif isinstance(line, CommandNode):
+            if line.name == "return":
+                # 干掉名字
+                term.pop(0)
+                term.pop(0)
+                # 干掉傻逼片假名
+                if len(term) > 0 and term[0].startswith("【") and term[0].endswith("】"):
+                    term.pop(0)
+                    term.pop(0)  # 去掉换行
+                dict[label] = ''.join(term)
+                term = []
+                label = None
+    with open(output_path, "w", encoding="utf-8-sig", newline="") as f:
+        writer = DictWriter(f, fieldnames=["term", "translation", "description"], lineterminator="\n")
+        writer.writeheader()
+        for k, v in dict.items():
+            writer.writerow({"term": k, "translation": "", "description": v})
+
+def extract_script_auto(script_path: str, output_path: str):
+    if isdir(script_path):
+        for file in listdir(script_path):
+            if not file.lower().endswith(".scn"):
+                continue
+            full_path = join(script_path, file)
+            output_file = splitext(basename(file))[0]
+            if file == "dict.scn":
+                output_file += ".csv"
+            else:
+                output_file += ".json"
+            output_full_path = join(output_path, output_file)
+            pdir = dirname(output_full_path)
+            if pdir and not isdir(pdir):
+                makedirs(pdir, exist_ok=True)
+            if file == "dict.scn":
+                extract_dict(full_path, output_full_path)
+            else:
+                extract_script(full_path, output_full_path)
+    else:
+        pdir = dirname(output_path)
+        if pdir and not isdir(pdir):
+            makedirs(pdir, exist_ok=True)
+        base_name = basename(script_path)
+        if base_name == "dict.scn":
+            extract_dict(script_path, output_path)
+        else:
+            extract_script(script_path, output_path)
+
+
+if __name__ == "__main__":
+    from argparse import ArgumentParser
+    parser = ArgumentParser(description="Process KAG script files")
+    subparser = parser.add_subparsers(title="Commands", dest="command")
+    extract_parser = subparser.add_parser("extract", help="Extract script to JSON")
+    extract_parser.add_argument("script_path", help="Path to KAG script file or directory")
+    extract_parser.add_argument("output_path", help="Path to output JSON file or directory")
+    args = parser.parse_args()
+    if args.command == "extract":
+        extract_script_auto(args.script_path, args.output_path)