Add kirikiri ks script extract support

This commit is contained in:
2025-07-02 10:46:45 +08:00
parent 7810e7a569
commit 4f3b4c5e78
9 changed files with 717 additions and 1 deletions

53
Cargo.lock generated
View File

@@ -14,6 +14,15 @@ version = "2.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa"
[[package]]
name = "aho-corasick"
version = "1.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916"
dependencies = [
"memchr",
]
[[package]]
name = "anstream"
version = "0.6.18"
@@ -70,6 +79,21 @@ version = "1.0.98"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e16d2d3311acee920a9eb8d33b8cbc1787ce4a264e85f964c2404b969bdcd487"
[[package]]
name = "bit-set"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "08807e080ed7f9d5433fa9b275196cfc35414f66a0c79d864dc51a0d825231a3"
dependencies = [
"bit-vec",
]
[[package]]
name = "bit-vec"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5e764a1d40d510daf35e07be9eb06e75770908c27d411ee6c92109c9840eaaf7"
[[package]]
name = "bitflags"
version = "1.3.2"
@@ -291,6 +315,17 @@ dependencies = [
"cfg-if",
]
[[package]]
name = "fancy-regex"
version = "0.14.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6e24cb5a94bcae1e5408b0effca5cd7172ea3c5755049c5f3af4cd283a165298"
dependencies = [
"bit-set",
"regex-automata",
"regex-syntax",
]
[[package]]
name = "fdeflate"
version = "0.3.7"
@@ -418,6 +453,7 @@ dependencies = [
"csv",
"emote-psb",
"encoding_rs",
"fancy-regex",
"flate2",
"int-enum",
"lazy_static",
@@ -544,6 +580,23 @@ dependencies = [
"getrandom",
]
[[package]]
name = "regex-automata"
version = "0.4.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908"
dependencies = [
"aho-corasick",
"memchr",
"regex-syntax",
]
[[package]]
name = "regex-syntax"
version = "0.8.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c"
[[package]]
name = "ryu"
version = "1.0.20"

View File

@@ -10,6 +10,7 @@ clap = { version = "4.5", features = ["derive"] }
csv = "1.3"
emote-psb = { version = "0.5", optional = true }
encoding_rs = "0.8"
fancy-regex = { version = "0.14", optional = true }
flate2 = { version = "1.1", optional = true }
int-enum = { version = "1.2", optional = true }
lazy_static = "1.5.0"
@@ -33,7 +34,7 @@ cat-system-img = ["cat-system", "flate2", "image", "utils-bit-stream"]
circus = []
escude = ["int-enum"]
escude-arc = ["escude", "rand", "utils-bit-stream"]
kirikiri = ["emote-psb", "flate2"]
kirikiri = ["emote-psb", "fancy-regex", "flate2", "utils-escape"]
yaneurao = []
yaneurao-itufuru = ["yaneurao"]
# basic feature
@@ -41,6 +42,7 @@ image = ["png"]
# utils feature
utils-bit-stream = []
utils-crc32 = []
utils-escape = ["fancy-regex"]
[target.'cfg(windows)'.dependencies]
windows-sys = { version = "0", features = ["Win32_Globalization", "Win32_System_Diagnostics_Debug"] }

View File

@@ -123,6 +123,28 @@ pub struct Arg {
#[arg(long, global = true)]
/// Kirikiri COMU message translation file. (Map<String, String>, key is original text, value is translated text.)
pub kirikiri_comumode_json: Option<String>,
#[cfg(feature = "kirikiri")]
#[arg(long, global = true, action = ArgAction::SetTrue, alias = "kr-no-empty-lines", alias = "kirikiri-no-empty-lines")]
/// Remove empty lines in Kirikiri KS script.
pub kirikiri_remove_empty_lines: bool,
#[cfg(feature = "kirikiri")]
#[arg(
long,
global = true,
value_delimiter = ',',
default_value = "nm,set_title,speaker,Talk,talk,cn,name,名前"
)]
/// Kirikiri name commands, used to extract names from ks script.
pub kirikiri_name_commands: Vec<String>,
#[cfg(feature = "kirikiri")]
#[arg(
long,
global = true,
value_delimiter = ',',
default_value = "sel01,sel02,sel03,sel04,AddSelect,ruby,exlink,e_xlink"
)]
/// Kirikiri message commands, used to extract more message from ks script.
pub kirikiri_message_commands: Vec<String>,
#[command(subcommand)]
/// Command
pub command: Command,

View File

@@ -1365,6 +1365,16 @@ fn main() {
.kirikiri_comumode_json
.as_ref()
.map(|s| scripts::kirikiri::read_kirikiri_comu_json(s).unwrap()),
#[cfg(feature = "kirikiri")]
kirikiri_remove_empty_lines: arg.kirikiri_remove_empty_lines,
#[cfg(feature = "kirikiri")]
kirikiri_name_commands: std::sync::Arc::new(std::collections::HashSet::from_iter(
arg.kirikiri_name_commands.iter().cloned(),
)),
#[cfg(feature = "kirikiri")]
kirikiri_message_commands: std::sync::Arc::new(std::collections::HashSet::from_iter(
arg.kirikiri_message_commands.iter().cloned(),
)),
};
match &arg.command {
args::Command::Export { input, output } => {

551
src/scripts/kirikiri/ks.rs Normal file
View File

@@ -0,0 +1,551 @@
use crate::scripts::base::*;
use crate::types::*;
use crate::utils::encoding::*;
use crate::utils::escape::*;
use anyhow::Result;
use fancy_regex::Regex;
use std::collections::HashSet;
use std::io::Write;
use std::ops::{Deref, DerefMut};
use std::sync::Arc;
/// Builder that turns raw `.ks` file contents into a [`KsScript`].
#[derive(Debug)]
pub struct KsBuilder {}

impl KsBuilder {
    /// Creates a builder; it holds no state.
    pub fn new() -> Self {
        KsBuilder {}
    }
}

impl ScriptBuilder for KsBuilder {
    /// Encoding assumed when none is given: CP932.
    fn default_encoding(&self) -> Encoding {
        Encoding::Cp932
    }

    /// Parses `buf` into a KS script.
    ///
    /// The file name and the archive encoding are not consulted.
    ///
    /// # Errors
    /// Propagates any error returned by [`KsScript::new`].
    fn build_script(
        &self,
        buf: Vec<u8>,
        _filename: &str,
        encoding: Encoding,
        _archive_encoding: Encoding,
        config: &ExtraConfig,
    ) -> Result<Box<dyn Script>> {
        let script = KsScript::new(buf, encoding, config)?;
        Ok(Box::new(script))
    }

    /// File extensions handled by this builder.
    fn extensions(&self) -> &'static [&'static str] {
        &["ks"]
    }

    /// Script type tag reported for scripts built here.
    fn script_type(&self) -> &'static ScriptType {
        &ScriptType::Kirikiri
    }
}
/// Shared interface of the parsed script elements in this file: each one can
/// be rendered to a `String`.
trait Node {
/// Renders this node as a string.
fn serialize(&self) -> String;
}
/// Text of a comment line.
#[derive(Clone, Debug)]
struct CommentNode(String);

impl Node for CommentNode {
    /// Renders the comment with a leading `; `.
    fn serialize(&self) -> String {
        let CommentNode(text) = self;
        format!("; {}", text)
    }
}
/// A label line: a name plus an optional page part.
#[derive(Clone, Debug)]
struct LabelNode {
    name: String,
    page: Option<String>,
}

impl Node for LabelNode {
    /// Renders `*name`, or `*name|page` when a page part is present.
    fn serialize(&self) -> String {
        match &self.page {
            Some(page) => format!("*{}|{}", self.name, page),
            None => format!("*{}", self.name),
        }
    }
}
/// A run of plain text, stored unescaped.
#[derive(Clone, Debug)]
struct TextNode(String);

impl Node for TextNode {
    /// Renders the text with every `[` doubled.
    fn serialize(&self) -> String {
        let TextNode(raw) = self;
        // In KAG, [ is escaped as [[
        raw.replace("[", "[[")
    }
}
/// Marker for a blank line.
#[derive(Clone, Debug)]
struct EmptyLineNode;

impl Node for EmptyLineNode {
    /// A blank line renders as the empty string.
    fn serialize(&self) -> String {
        String::default()
    }
}
/// Value of a single tag attribute.
#[derive(Clone, Debug)]
enum TagAttr {
// Attribute given as a bare name, with no `=value` part.
True,
// Attribute carrying a string value.
Str(String),
}
/// A tag: its name and its attributes in source order.
#[derive(Clone, Debug)]
struct TagNode {
    name: String,
    attributes: Vec<(String, TagAttr)>,
}

impl TagNode {
    /// Renders the attributes as space-separated `key` / `key=value` items.
    ///
    /// A value is quoted whenever writing it bare would not read back as the
    /// same value: when it is empty, starts with a quote character, or
    /// contains whitespace, `=` or `]`. Double quotes are used unless the
    /// value itself contains `"` (and no `'`), in which case single quotes
    /// are used.
    fn serialize_attributes(&self) -> String {
        let mut parts = Vec::with_capacity(self.attributes.len());
        for (key, value) in &self.attributes {
            match value {
                TagAttr::True => parts.push(key.clone()),
                TagAttr::Str(val) => {
                    let needs_quotes = val.is_empty()
                        || val.starts_with(|c: char| c == '"' || c == '\'')
                        || val
                            .chars()
                            .any(|c| c.is_whitespace() || c == '=' || c == ']');
                    if !needs_quotes {
                        parts.push(format!("{}={}", key, val));
                    } else if val.contains('"') && !val.contains('\'') {
                        // Double quotes would end early at the embedded `"`.
                        parts.push(format!("{}='{}'", key, val));
                    } else {
                        parts.push(format!("{}=\"{}\"", key, val));
                    }
                }
            }
        }
        parts.join(" ")
    }

    /// Renders the attributes for the XML-like message form: bare names stay
    /// bare, string values are always double-quoted and escaped.
    fn ser_attributes_xml(&self) -> String {
        let mut parts = Vec::with_capacity(self.attributes.len());
        for (key, value) in &self.attributes {
            match value {
                TagAttr::True => parts.push(key.clone()),
                TagAttr::Str(val) => {
                    parts.push(format!("{}=\"{}\"", key, escape_xml_attr_value(val)));
                }
            }
        }
        parts.join(" ")
    }

    /// Renders the tag as `<name>` or `<name attrs>`.
    fn to_xml_tag(&self) -> String {
        let attr_str = self.ser_attributes_xml();
        if attr_str.is_empty() {
            format!("<{}>", self.name)
        } else {
            format!("<{} {}>", self.name, attr_str)
        }
    }
}

impl Node for TagNode {
    /// Renders the tag in bracket form: `[name]` or `[name attrs]`.
    fn serialize(&self) -> String {
        let attr_str = self.serialize_attributes();
        if attr_str.is_empty() {
            format!("[{}]", self.name)
        } else {
            format!("[{} {}]", self.name, attr_str)
        }
    }
}
#[derive(Clone)]
struct CommandNode {
inner: TagNode,
}
impl Deref for CommandNode {
type Target = TagNode;
fn deref(&self) -> &Self::Target {
&self.inner
}
}
impl DerefMut for CommandNode {
fn deref_mut(&mut self) -> &mut Self::Target {
&mut self.inner
}
}
impl std::fmt::Debug for CommandNode {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("CommandNode")
.field("name", &self.inner.name)
.field("attributes", &self.inner.attributes)
.finish()
}
}
impl Node for CommandNode {
fn serialize(&self) -> String {
let attr_str = self.inner.serialize_attributes();
if attr_str.is_empty() {
format!("@{}", self.inner.name)
} else {
format!("@{} {}", self.inner.name, attr_str)
}
}
}
/// Body of an `[iscript]` ... `[endscript]` block, without the two marker lines.
#[derive(Clone, Debug)]
struct ScriptBlockNode(String);

impl Node for ScriptBlockNode {
    /// Renders the body between `[iscript]` and `[endscript]` lines.
    fn serialize(&self) -> String {
        let ScriptBlockNode(body) = self;
        format!("[iscript]\n{}\n[endscript]", body)
    }
}
#[derive(Clone, Debug)]
enum ParsedLineNode {
Text(TextNode),
Tag(TagNode),
}
impl ParsedLineNode {
fn to_xml(&self) -> String {
match self {
ParsedLineNode::Text(text_node) => escape_xml_text_value(&text_node.0),
ParsedLineNode::Tag(tag_node) => {
if tag_node.name == "r" && tag_node.attributes.is_empty() {
"\n".to_string()
} else {
tag_node.to_xml_tag()
}
}
}
}
}
impl Node for ParsedLineNode {
fn serialize(&self) -> String {
match self {
ParsedLineNode::Text(text_node) => text_node.serialize(),
ParsedLineNode::Tag(tag_node) => tag_node.serialize(),
}
}
}
#[derive(Clone, Debug)]
struct ParsedLine(Vec<ParsedLineNode>);
impl ParsedLine {
fn to_xml(&self) -> String {
let mut s = String::new();
for node in &self.0 {
s.push_str(&node.to_xml());
}
s
}
}
impl Node for ParsedLine {
fn serialize(&self) -> String {
self.0
.iter()
.map(|node| node.serialize())
.collect::<Vec<_>>()
.join("")
}
}
/// One top-level element of a parsed script.
#[derive(Clone, Debug)]
enum ParsedScriptNode {
    Comment(CommentNode),
    Label(LabelNode),
    Command(CommandNode),
    ScriptBlock(ScriptBlockNode),
    Line(ParsedLine),
    EmptyLine(EmptyLineNode),
}

impl Node for ParsedScriptNode {
    /// Delegates to the wrapped node's own `serialize`.
    fn serialize(&self) -> String {
        match self {
            Self::Comment(inner) => inner.serialize(),
            Self::Label(inner) => inner.serialize(),
            Self::Command(inner) => inner.serialize(),
            Self::ScriptBlock(inner) => inner.serialize(),
            Self::Line(inner) => inner.serialize(),
            Self::EmptyLine(inner) => inner.serialize(),
        }
    }
}
/// A whole parsed script: its top-level nodes in source order.
#[derive(Clone, Debug)]
struct ParsedScript(Vec<ParsedScriptNode>);

impl ParsedScript {
    /// Iterates over the top-level nodes.
    fn iter(&self) -> impl Iterator<Item = &ParsedScriptNode> {
        let ParsedScript(nodes) = self;
        nodes.iter()
    }

    /// Iterates mutably over the top-level nodes.
    fn iter_mut(&mut self) -> impl Iterator<Item = &mut ParsedScriptNode> {
        let ParsedScript(nodes) = self;
        nodes.iter_mut()
    }
}

impl Node for ParsedScript {
    /// Serializes every node and joins them with `\n` (no trailing newline).
    fn serialize(&self) -> String {
        let mut rendered = String::new();
        for (index, node) in self.0.iter().enumerate() {
            if index > 0 {
                rendered.push('\n');
            }
            rendered.push_str(&node.serialize());
        }
        rendered
    }
}
lazy_static::lazy_static! {
static ref LINE_SPLIT_RE: Regex = Regex::new(r"(\[.*?\])").unwrap();
static ref ATTR_RE: Regex = Regex::new("([a-zA-Z0-9_]+)(?:=(\"[^\"]*\" |'[^']*' |[^\\s\\]]+))?").unwrap();
}
struct Parser {
lines: Vec<String>,
}
impl Parser {
pub fn new(script: &str) -> Self {
let lines = script.lines().map(|s| s.to_string()).collect();
Self { lines }
}
fn parse_attributes(attr_str: &str) -> Result<Vec<(String, TagAttr)>> {
let mut attributes = Vec::new();
for cap in ATTR_RE.captures_iter(attr_str) {
let cap = cap?;
let key = cap
.get(1)
.ok_or(anyhow::anyhow!("Invalid attribute key"))?
.as_str()
.to_string();
let value = cap
.get(2)
.map(|v| {
let mut s = v.as_str().to_string();
if s.starts_with("\"") && s.ends_with("\"") {
s = s[1..s.len() - 1].to_string();
} else if s.starts_with("'") && s.ends_with("'") {
s = s[1..s.len() - 1].to_string();
}
s = s.replace("`", "");
TagAttr::Str(s)
})
.unwrap_or(TagAttr::True);
attributes.push((key, value));
}
Ok(attributes)
}
fn parse_tag_or_command(content: &str) -> Result<TagNode> {
let parts = content.trim().split_ascii_whitespace().collect::<Vec<_>>();
let tag_name = parts[0].to_string();
let attr_string = parts[1..].join(" ");
let attrs = Self::parse_attributes(&attr_string)?;
Ok(TagNode {
name: tag_name,
attributes: attrs,
})
}
fn parse(&self, preserve_empty_lines: bool) -> Result<ParsedScript> {
let mut parsed_scripts = Vec::new();
let mut in_script_block = false;
let mut script_buffer = Vec::new();
let mut i = 0;
let line_count = self.lines.len();
while i < line_count {
let line = self.lines[i].trim();
i += 1;
if line.is_empty() {
if preserve_empty_lines {
parsed_scripts.push(ParsedScriptNode::EmptyLine(EmptyLineNode));
} else {
continue;
}
}
if in_script_block {
if line == "[endscript]" {
in_script_block = false;
parsed_scripts.push(ParsedScriptNode::ScriptBlock(ScriptBlockNode(
script_buffer.join("\n"),
)));
script_buffer.clear();
} else {
script_buffer.push(line.to_string());
}
continue;
}
if line == "[iscript]" {
in_script_block = true;
continue;
}
if line.starts_with(";") {
parsed_scripts.push(ParsedScriptNode::Comment(CommentNode(
line[1..].trim().to_string(),
)));
continue;
}
if line.starts_with("*") {
let parts: Vec<&str> = line.split('|').collect();
let label_name = parts[0][1..].trim().to_string();
let page = if parts.len() > 1 {
Some(parts[1..].join("|"))
} else {
None
};
parsed_scripts.push(ParsedScriptNode::Label(LabelNode {
name: label_name,
page,
}));
continue;
}
if line.starts_with("@") {
let content = &line[1..];
let tag_node = Self::parse_tag_or_command(content)?;
parsed_scripts.push(ParsedScriptNode::Command(CommandNode { inner: tag_node }));
continue;
}
let mut full_line = line.to_string();
while full_line.ends_with("\\") {
full_line.pop(); // Remove the trailing backslash
full_line = full_line.trim_end().to_string();
if i < line_count {
full_line.push(' ');
full_line.push_str(&self.lines[i].trim());
i += 1;
} else {
break; // No more lines to append
}
}
let mut parsed_line_nodes = Vec::new();
for part in LINE_SPLIT_RE.split(&full_line) {
let part = part?;
if part.is_empty() {
continue;
}
if part.starts_with("[") && part.ends_with("]") {
if part == "[[r]]" {
parsed_line_nodes.push(ParsedLineNode::Text(TextNode("[r]".to_string())));
} else if part == "[[[[" {
parsed_line_nodes.push(ParsedLineNode::Text(TextNode("[[".to_string())));
} else if part.starts_with("[[") {
parsed_line_nodes
.push(ParsedLineNode::Text(TextNode(part[1..].to_string())))
} else {
parsed_line_nodes.push(ParsedLineNode::Tag(Self::parse_tag_or_command(
&part[1..part.len() - 1],
)?));
}
}
}
if !parsed_line_nodes.is_empty() {
parsed_scripts.push(ParsedScriptNode::Line(ParsedLine(parsed_line_nodes)));
}
}
Ok(ParsedScript(parsed_scripts))
}
}
/// A parsed KS script plus the settings needed to extract text from it.
#[derive(Debug)]
pub struct KsScript {
    bom: BomType,
    tree: ParsedScript,
    name_commands: Arc<HashSet<String>>,
    message_commands: Arc<HashSet<String>>,
}

impl KsScript {
    /// Decodes `reader` with `encoding` (detecting a BOM) and parses it.
    ///
    /// Empty lines are kept unless `config.kirikiri_remove_empty_lines` is
    /// set. The name and message command sets are shared from `config`.
    ///
    /// # Errors
    /// Returns an error if decoding or parsing fails.
    pub fn new(reader: Vec<u8>, encoding: Encoding, config: &ExtraConfig) -> Result<Self> {
        let (text, bom) = decode_with_bom_detect(encoding, &reader)?;
        let keep_empty_lines = !config.kirikiri_remove_empty_lines;
        let tree = Parser::new(&text).parse(keep_empty_lines)?;
        let name_commands = Arc::clone(&config.kirikiri_name_commands);
        let message_commands = Arc::clone(&config.kirikiri_message_commands);
        Ok(Self {
            bom,
            tree,
            name_commands,
            message_commands,
        })
    }
}
impl Script for KsScript {
fn default_output_script_type(&self) -> OutputScriptType {
OutputScriptType::Json
}
fn default_format_type(&self) -> FormatOptions {
FormatOptions::None
}
fn extract_messages(&self) -> Result<Vec<Message>> {
let mut messages = Vec::new();
let mut name = None;
for obj in self.tree.iter() {
match obj {
ParsedScriptNode::Line(line) => messages.push(Message {
name: name.take(),
message: line.to_xml(),
}),
ParsedScriptNode::Command(cmd) => {
if self.name_commands.contains(&cmd.name) {
for attr in &cmd.attributes {
if let TagAttr::Str(value) = &attr.1 {
if !value.is_empty() && !value.is_ascii() {
name = Some(value.clone());
break; // Only take the first name found
}
}
}
} else if self.message_commands.contains(&cmd.name) {
for attr in &cmd.attributes {
if let TagAttr::Str(value) = &attr.1 {
if !value.is_empty() && !value.is_ascii() {
messages.push(Message {
name: name.take(),
message: value.clone(),
});
break; // Only take the first message found
}
}
}
}
}
_ => {}
}
}
Ok(messages)
}
fn import_messages<'a>(
&'a self,
messages: Vec<Message>,
mut file: Box<dyn WriteSeek + 'a>,
encoding: Encoding,
_replacement: Option<&'a ReplacementTable>,
) -> Result<()> {
let mut mes = messages.iter();
let mut _cur_mes = mes.next();
let mut tree = self.tree.clone();
for obj in tree.iter_mut() {
match obj {
_ => {}
}
}
let s = tree.serialize();
let data = encode_string_with_bom(encoding, &s, false, self.bom)?;
file.write_all(&data)?;
Ok(())
}
}

View File

@@ -1,3 +1,4 @@
pub mod ks;
pub mod scn;
pub mod simple_crypt;
use std::collections::HashMap;

View File

@@ -213,6 +213,12 @@ pub struct ExtraConfig {
pub kirikiri_export_comumode: bool,
#[cfg(feature = "kirikiri")]
pub kirikiri_comumode_json: Option<std::sync::Arc<HashMap<String, String>>>,
#[cfg(feature = "kirikiri")]
pub kirikiri_remove_empty_lines: bool,
#[cfg(feature = "kirikiri")]
pub kirikiri_name_commands: std::sync::Arc<std::collections::HashSet<String>>,
#[cfg(feature = "kirikiri")]
pub kirikiri_message_commands: std::sync::Arc<std::collections::HashSet<String>>,
}
#[derive(Clone, Copy, Debug, ValueEnum, PartialEq, Eq, PartialOrd, Ord)]
@@ -276,6 +282,10 @@ pub enum ScriptType {
#[value(alias("kr-simple-crypt"))]
/// Kirikiri SimpleCrypt's text file
KirikiriSimpleCrypt,
#[cfg(feature = "kirikiri")]
#[value(alias = "kr", alias = "kr-ks", alias = "kirikiri-ks")]
/// Kirikiri script
Kirikiri,
#[cfg(feature = "yaneurao-itufuru")]
#[value(alias("itufuru"))]
/// Yaneurao Itufuru script

65
src/utils/escape.rs Normal file
View File

@@ -0,0 +1,65 @@
use fancy_regex::Regex;
/// Escapes `s` for use inside a quoted XML attribute value.
///
/// Replaces `&`, `<`, `"` and `'` with their entities; `>` and every other
/// character pass through unchanged.
pub fn escape_xml_attr_value(s: &str) -> String {
    s.chars()
        .fold(String::with_capacity(s.len()), |mut out, ch| {
            let entity = match ch {
                '&' => "&amp;",
                '<' => "&lt;",
                '"' => "&quot;",
                '\'' => "&apos;",
                other => {
                    out.push(other);
                    return out;
                }
            };
            out.push_str(entity);
            out
        })
}
/// Escapes `s` for use as XML text content.
///
/// Replaces `&`, `<`, `>`, `"` and `'` with their entities; every other
/// character passes through unchanged.
pub fn escape_xml_text_value(s: &str) -> String {
    s.chars()
        .fold(String::with_capacity(s.len()), |mut out, ch| {
            let entity = match ch {
                '&' => "&amp;",
                '<' => "&lt;",
                '>' => "&gt;",
                '"' => "&quot;",
                '\'' => "&apos;",
                other => {
                    out.push(other);
                    return out;
                }
            };
            out.push_str(entity);
            out
        })
}
/// Decodes the reference at the start of `tail`, which must begin with `&`.
///
/// Recognizes the five predefined entities (`amp`, `lt`, `gt`, `quot`,
/// `apos`) and numeric references in decimal (`&#NNN;`) or hexadecimal
/// (`&#xHHH;`) form. Returns the decoded character and the number of bytes
/// consumed, or `None` when `tail` does not start with a valid reference.
fn decode_xml_reference(tail: &str) -> Option<(char, usize)> {
    let after_amp = tail.strip_prefix('&')?;
    // Stop at the first character that cannot be part of a reference body,
    // so a stray `&` never triggers a scan of the rest of the string.
    let body_len = after_amp.find(|c: char| !(c.is_ascii_alphanumeric() || c == '#'))?;
    if !after_amp[body_len..].starts_with(';') {
        return None;
    }
    let body = &after_amp[..body_len];
    let decoded = match body {
        "amp" => '&',
        "lt" => '<',
        "gt" => '>',
        "quot" => '"',
        "apos" => '\'',
        _ => {
            let number = body.strip_prefix('#')?;
            let (digits, radix) = match number.strip_prefix('x') {
                Some(hex) => (hex, 16),
                None => (number, 10),
            };
            // `from_str_radix` would also accept a leading sign; reject it.
            if digits.is_empty() || !digits.chars().all(|c| c.is_digit(radix)) {
                return None;
            }
            char::from_u32(u32::from_str_radix(digits, radix).ok()?)?
        }
    };
    // `&` + body + `;`
    Some((decoded, body_len + 2))
}

/// Reverses XML escaping in `s`.
///
/// The input is scanned once, so the result of decoding one reference is
/// never decoded again: `&amp;lt;` yields `&lt;`, not `<`. References that
/// are malformed, unknown, or name an invalid code point are copied through
/// unchanged.
pub fn unescape_xml(s: &str) -> String {
    let mut out = String::with_capacity(s.len());
    let mut rest = s;
    while let Some(amp) = rest.find('&') {
        out.push_str(&rest[..amp]);
        let tail = &rest[amp..];
        match decode_xml_reference(tail) {
            Some((decoded, consumed)) => {
                out.push(decoded);
                rest = &tail[consumed..];
            }
            None => {
                out.push('&');
                rest = &tail[1..];
            }
        }
    }
    out.push_str(rest);
    out
}

#[test]
fn test_unescape_xml() {
    assert_eq!(
        unescape_xml("Hello &amp;amp; World &lt;script&gt;alert(&#x27;XSS&#x27;)&lt;/script&gt;"),
        "Hello &amp; World <script>alert('XSS')</script>"
    );
    assert_eq!(unescape_xml("&#20320;TEST&#x20;"), "你TEST ");
    // Each reference is decoded exactly once.
    assert_eq!(unescape_xml("&amp;lt;"), "&lt;");
    assert_eq!(unescape_xml("&#38;lt;"), "&lt;");
    // Invalid or unknown references are left as written.
    assert_eq!(unescape_xml("&#99999999999;"), "&#99999999999;");
    assert_eq!(unescape_xml("a & b &unknown; &"), "a & b &unknown; &");
}

View File

@@ -6,6 +6,8 @@ pub mod crc32;
pub mod encoding;
#[cfg(windows)]
mod encoding_win;
#[cfg(feature = "utils-escape")]
pub mod escape;
pub mod files;
#[cfg(feature = "image")]
pub mod img;