From 601abba284029123193c2df360f44cc49175bd68 Mon Sep 17 00:00:00 2001 From: lifegpc Date: Tue, 8 Jul 2025 10:20:21 +0800 Subject: [PATCH] Add support for PUA characters in BGI strings --- src/ext/io.rs | 2 +- src/main.rs | 8 +-- src/scripts/bgi/bp.rs | 2 +- src/scripts/bgi/bsi.rs | 8 +-- src/scripts/bgi/parser.rs | 3 +- src/scripts/bgi/script.rs | 9 ++- src/scripts/cat_system/archive/int.rs | 2 +- src/scripts/circus/script.rs | 6 +- src/scripts/escude/archive.rs | 4 +- src/scripts/escude/list.rs | 2 +- src/scripts/escude/script.rs | 4 +- src/scripts/kirikiri/image/dref.rs | 2 +- src/scripts/kirikiri/ks.rs | 2 +- src/scripts/kirikiri/scn.rs | 2 +- src/scripts/yaneurao/itufuru/script.rs | 2 +- src/utils/encoding.rs | 77 +++++++++++++++++--------- src/utils/encoding_win.rs | 29 +++++++--- 17 files changed, 104 insertions(+), 60 deletions(-) diff --git a/src/ext/io.rs b/src/ext/io.rs index 693a8d9..181c857 100644 --- a/src/ext/io.rs +++ b/src/ext/io.rs @@ -723,7 +723,7 @@ impl ReadExt for T { buf.truncate(pos); } } - let s = decode_to_string(encoding, &buf) + let s = decode_to_string(encoding, &buf, true) .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))?; Ok(s) } diff --git a/src/main.rs b/src/main.rs index d430330..d0aebef 100644 --- a/src/main.rs +++ b/src/main.rs @@ -895,7 +895,7 @@ pub fn import_script( continue; } }; - let s = match utils::encoding::decode_to_string(enc, &b) { + let s = match utils::encoding::decode_to_string(enc, &b, true) { Ok(s) => s, Err(e) => { eprintln!("Error decoding string: {}", e); @@ -922,7 +922,7 @@ pub fn import_script( continue; } }; - let s = match utils::encoding::decode_to_string(enc, &b) { + let s = match utils::encoding::decode_to_string(enc, &b, true) { Ok(s) => s, Err(e) => { eprintln!("Error decoding string: {}", e); @@ -1086,13 +1086,13 @@ pub fn import_script( types::OutputScriptType::Json => { let enc = get_output_encoding(arg); let b = utils::files::read_file(&out_f)?; - let s = 
utils::encoding::decode_to_string(enc, &b)?; + let s = utils::encoding::decode_to_string(enc, &b, true)?; serde_json::from_str::>(&s)? } types::OutputScriptType::M3t => { let enc = get_output_encoding(arg); let b = utils::files::read_file(&out_f)?; - let s = utils::encoding::decode_to_string(enc, &b)?; + let s = utils::encoding::decode_to_string(enc, &b, true)?; let mut parser = output_scripts::m3t::M3tParser::new(&s); parser.parse()? } diff --git a/src/scripts/bgi/bp.rs b/src/scripts/bgi/bp.rs index bd09ad9..d34ba6e 100644 --- a/src/scripts/bgi/bp.rs +++ b/src/scripts/bgi/bp.rs @@ -116,7 +116,7 @@ impl Script for BGIBpScript { let text_address = i.offset_pos + i.text_offset as usize - 1; // println!("offset: {}, text address: {}, text_offset: {}", i.offset_pos, text_address, i.text_offset); let str = self.data.cpeek_cstring_at(text_address)?; - let str = decode_to_string(self.encoding, str.as_bytes())?; + let str = decode_to_string(self.encoding, str.as_bytes(), true)?; messages.push(Message { name: None, message: str, diff --git a/src/scripts/bgi/bsi.rs b/src/scripts/bgi/bsi.rs index e06acfd..5cb0a9a 100644 --- a/src/scripts/bgi/bsi.rs +++ b/src/scripts/bgi/bsi.rs @@ -66,14 +66,14 @@ impl BGIBsiScript { let section_count = reader.read_u32()?; for _ in 0..section_count { let section_name = reader.read_cstring()?; - let section_name = decode_to_string(encoding, section_name.as_bytes())?; + let section_name = decode_to_string(encoding, section_name.as_bytes(), true)?; let mut section_data = BTreeMap::new(); let entry_count = reader.read_u32()?; for _ in 0..entry_count { let key = reader.read_cstring()?; - let key = decode_to_string(encoding, key.as_bytes())?; + let key = decode_to_string(encoding, key.as_bytes(), true)?; let value = reader.read_cstring()?; - let value = decode_to_string(encoding, value.as_bytes())?; + let value = decode_to_string(encoding, value.as_bytes(), true)?; section_data.insert(key, value); } data.insert(section_name, section_data); @@ -134,7 
+134,7 @@ fn create_file<'a>( output_encoding: Encoding, ) -> Result<()> { let input = crate::utils::files::read_file(custom_filename)?; - let s = decode_to_string(output_encoding, &input)?; + let s = decode_to_string(output_encoding, &input, true)?; let data: BTreeMap> = serde_json::from_str(&s) .map_err(|e| anyhow::anyhow!("Failed to read BSI Map data from JSON: {}", e))?; writer.write_u32(data.len() as u32)?; diff --git a/src/scripts/bgi/parser.rs b/src/scripts/bgi/parser.rs index 45c945f..05acf2f 100644 --- a/src/scripts/bgi/parser.rs +++ b/src/scripts/bgi/parser.rs @@ -454,7 +454,8 @@ impl<'a> V1Parser<'a> { pub fn read_string_at_address(&mut self, address: usize) -> Result { let start = self.offset + address; let buf = self.buf.peek_cstring_at(start)?; - Ok(decode_to_string(self.encoding, buf.as_bytes())?) + // Sometimes string has private use area characters, so we disable strict checking + Ok(decode_to_string(self.encoding, buf.as_bytes(), false)?) } pub fn handle_user_function_call(&mut self) -> Result<()> { diff --git a/src/scripts/bgi/script.rs b/src/scripts/bgi/script.rs index 085dee0..bbcbdce 100644 --- a/src/scripts/bgi/script.rs +++ b/src/scripts/bgi/script.rs @@ -17,7 +17,11 @@ impl BGIScriptBuilder { impl ScriptBuilder for BGIScriptBuilder { fn default_encoding(&self) -> Encoding { - Encoding::Cp932 + #[cfg(not(windows))] + return Encoding::Cp932; + #[cfg(windows)] + // Use Windows API first, because encoding-rs does not support PRIVATE USE AREA characters + return Encoding::CodePage(932); } fn build_script( @@ -101,7 +105,8 @@ impl BGIScript { fn read_string(&self, offset: usize) -> Result { let start = self.offset + offset; let string_data = self.data.cpeek_cstring_at(start)?; - let string = decode_to_string(self.encoding, string_data.as_bytes())?; + // sometimes string has private use area characters, so we disable strict checking + let string = decode_to_string(self.encoding, string_data.as_bytes(), false)?; Ok(string) } } diff --git 
a/src/scripts/cat_system/archive/int.rs b/src/scripts/cat_system/archive/int.rs index ad3047c..0906b7e 100644 --- a/src/scripts/cat_system/archive/int.rs +++ b/src/scripts/cat_system/archive/int.rs @@ -362,7 +362,7 @@ impl CSIntArc { k += 1; i += 1; } - decode_to_string(encoding, &name[..i]) + decode_to_string(encoding, &name[..i], true) } fn get_key(password: &str) -> Result { diff --git a/src/scripts/circus/script.rs b/src/scripts/circus/script.rs index f417b6f..6775efc 100644 --- a/src/scripts/circus/script.rs +++ b/src/scripts/circus/script.rs @@ -198,12 +198,12 @@ impl Script for CircusMesScript { for t in text.iter_mut() { *t = (*t).overflowing_add(self.info.deckey).0; } - t = Some(decode_to_string(self.encoding, &text)?); + t = Some(decode_to_string(self.encoding, &text, true)?); // println!("Token(enc): {:?}, {}", token, t.as_ref().unwrap()); } else if token.value == self.info.optunenc { let text = &self.data[self.asm_bin_offset + token.offset + 1 ..self.asm_bin_offset + token.offset + token.length - 1]; - t = Some(decode_to_string(self.encoding, text)?); + t = Some(decode_to_string(self.encoding, text, true)?); // println!("Token: {:?}, {}", token, t.as_ref().unwrap()); } match t { @@ -236,7 +236,7 @@ impl Script for CircusMesScript { encoding: Encoding, ) -> Result<()> { let jis = encode_string(Encoding::Cp932, s, true)?; - let out = decode_to_string(encoding, &jis)?; + let out = decode_to_string(encoding, &jis, true)?; repls.push((s.to_string(), out)); Ok(()) } diff --git a/src/scripts/escude/archive.rs b/src/scripts/escude/archive.rs index 8ad2b22..defa697 100644 --- a/src/scripts/escude/archive.rs +++ b/src/scripts/escude/archive.rs @@ -251,7 +251,7 @@ impl<'a, T: Iterator, R: Read + Seek> Iterator Ok(name) => name, Err(e) => return Some(Err(e.into())), }; - let name = match decode_to_string(self.archive_encoding, name.as_bytes()) { + let name = match decode_to_string(self.archive_encoding, name.as_bytes(), true) { Ok(name) => name, Err(e) => return 
Some(Err(e.into())), }; @@ -283,7 +283,7 @@ impl<'a, T: Iterator, R: Read + Seek> Iterator Ok(name) => name, Err(e) => return Some(Err(e.into())), }; - let name = match decode_to_string(self.archive_encoding, name.as_bytes()) { + let name = match decode_to_string(self.archive_encoding, name.as_bytes(), true) { Ok(name) => name, Err(e) => return Some(Err(e.into())), }; diff --git a/src/scripts/escude/list.rs b/src/scripts/escude/list.rs index 7e18490..171a50f 100644 --- a/src/scripts/escude/list.rs +++ b/src/scripts/escude/list.rs @@ -278,7 +278,7 @@ fn create_file<'a>( output_encoding: Encoding, ) -> Result<()> { let input = crate::utils::files::read_file(custom_filename)?; - let s = decode_to_string(output_encoding, &input)?; + let s = decode_to_string(output_encoding, &input, true)?; let entries: Vec = serde_json::from_str(&s) .map_err(|e| anyhow::anyhow!("Failed to read Escude list from JSON: {}", e))?; writer.write_all(b"LIST")?; diff --git a/src/scripts/escude/script.rs b/src/scripts/escude/script.rs index eee64a3..a141f26 100644 --- a/src/scripts/escude/script.rs +++ b/src/scripts/escude/script.rs @@ -109,12 +109,12 @@ impl EscudeBinScript { for _ in 0..string_count { let s = reader.read_cstring()?; let s = replaces.replace(s.as_bytes())?; - strings.push(decode_to_string(encoding, &s)?); + strings.push(decode_to_string(encoding, &s, true)?); } } else { for _ in 0..string_count { let s = reader.read_cstring()?; - strings.push(decode_to_string(encoding, s.as_bytes())?); + strings.push(decode_to_string(encoding, s.as_bytes(), true)?); } } let names = match &config.escude_enum_scr { diff --git a/src/scripts/kirikiri/image/dref.rs b/src/scripts/kirikiri/image/dref.rs index dbcfc2b..5f20cdd 100644 --- a/src/scripts/kirikiri/image/dref.rs +++ b/src/scripts/kirikiri/image/dref.rs @@ -164,7 +164,7 @@ impl Dref { filename: &str, _config: &ExtraConfig, ) -> Result { - let text = decode_with_bom_detect(encoding, &buf)?.0; + let text = decode_with_bom_detect(encoding, 
&buf, true)?.0; let mut urls = Vec::new(); for text in text.lines() { let text = text.trim(); diff --git a/src/scripts/kirikiri/ks.rs b/src/scripts/kirikiri/ks.rs index 7abefa0..4c1a1dc 100644 --- a/src/scripts/kirikiri/ks.rs +++ b/src/scripts/kirikiri/ks.rs @@ -666,7 +666,7 @@ pub struct KsScript { impl KsScript { pub fn new(reader: Vec, encoding: Encoding, config: &ExtraConfig) -> Result { - let (text, bom) = decode_with_bom_detect(encoding, &reader)?; + let (text, bom) = decode_with_bom_detect(encoding, &reader, true)?; let parser = Parser::new(&text); let tree = parser.parse(!config.kirikiri_remove_empty_lines)?; Ok(Self { diff --git a/src/scripts/kirikiri/scn.rs b/src/scripts/kirikiri/scn.rs index 3b35401..79a8275 100644 --- a/src/scripts/kirikiri/scn.rs +++ b/src/scripts/kirikiri/scn.rs @@ -618,7 +618,7 @@ impl Script for ScnScript { output_encoding: Encoding, ) -> Result<()> { let data = crate::utils::files::read_file(custom_filename)?; - let s = decode_to_string(output_encoding, &data)?; + let s = decode_to_string(output_encoding, &data, true)?; let json = json::parse(&s)?; let mut psb = self.psb.clone(); psb.from_json(&json)?; diff --git a/src/scripts/yaneurao/itufuru/script.rs b/src/scripts/yaneurao/itufuru/script.rs index 009afc3..9a1c2dd 100644 --- a/src/scripts/yaneurao/itufuru/script.rs +++ b/src/scripts/yaneurao/itufuru/script.rs @@ -122,7 +122,7 @@ impl Script for ItufuruScript { for i in self.strings.iter() { let str_pos = i.len_pos + 2; // Skip the length bytes let s = self.data.cpeek_cstring_at(str_pos)?; - let decoded = decode_to_string(self.encoding, s.as_bytes())?; + let decoded = decode_to_string(self.encoding, s.as_bytes(), true)?; messages.push(Message { name: None, message: decoded, diff --git a/src/utils/encoding.rs b/src/utils/encoding.rs index c09e350..6e65e51 100644 --- a/src/utils/encoding.rs +++ b/src/utils/encoding.rs @@ -3,6 +3,7 @@ use crate::types::*; pub fn decode_with_bom_detect( encoding: Encoding, data: &[u8], + check: bool, 
) -> Result<(String, BomType), anyhow::Error> { if data.len() >= 2 { if data[0] == 0xFE && data[1] == 0xFF { @@ -34,7 +35,7 @@ pub fn decode_with_bom_detect( if data.len() >= 8 && data.starts_with(b"mdf\0") { let reader = MemReaderRef::new(&data[4..]); let decoded = Mdf::unpack(reader)?; - return decode_with_bom_detect(encoding, &decoded); + return decode_with_bom_detect(encoding, &decoded, check); } if data.len() >= 5 && data[0] == 0xFE @@ -46,38 +47,54 @@ pub fn decode_with_bom_detect( let crypt = data[2]; let reader = MemReaderRef::new(data); let decoded = SimpleCrypt::unpack(crypt, reader)?; - return decode_with_bom_detect(encoding, &decoded); + return decode_with_bom_detect(encoding, &decoded, check); } } - decode_to_string(encoding, data).map(|s| (s, BomType::None)) + decode_to_string(encoding, data, check).map(|s| (s, BomType::None)) } -pub fn decode_to_string(encoding: Encoding, data: &[u8]) -> Result { +pub fn decode_to_string( + encoding: Encoding, + data: &[u8], + check: bool, +) -> Result { match encoding { - Encoding::Auto => decode_to_string(Encoding::Utf8, data) - .or_else(|_| decode_to_string(Encoding::Cp932, data)) - .or_else(|_| decode_to_string(Encoding::Gb2312, data)), + Encoding::Auto => decode_to_string(Encoding::Utf8, data, check) + .or_else(|_| decode_to_string(Encoding::Cp932, data, check)) + .or_else(|_| decode_to_string(Encoding::Gb2312, data, check)), Encoding::Utf8 => Ok(String::from_utf8(data.to_vec())?), Encoding::Cp932 => { let result = encoding_rs::SHIFT_JIS.decode(data); if result.2 { - Err(anyhow::anyhow!("Failed to decode Shift-JIS")) - } else { - Ok(result.0.to_string()) + if check { + return Err(anyhow::anyhow!("Failed to decode Shift-JIS")); + } + eprintln!( + "Warning: Some characters could not be decoded in Shift-JIS: {:?}", + data + ); + crate::COUNTER.inc_warning(); } + Ok(result.0.to_string()) } Encoding::Gb2312 => { let result = encoding_rs::GBK.decode(data); if result.2 { - Err(anyhow::anyhow!("Failed to decode 
GB2312")) - } else { - Ok(result.0.to_string()) + if check { + return Err(anyhow::anyhow!("Failed to decode GB2312")); + } + eprintln!( + "Warning: Some characters could not be decoded in GB2312: {:?}", + data + ); + crate::COUNTER.inc_warning(); } + Ok(result.0.to_string()) } #[cfg(windows)] - Encoding::CodePage(code_page) => { - Ok(super::encoding_win::decode_to_string(code_page, data)?) - } + Encoding::CodePage(code_page) => Ok(super::encoding_win::decode_to_string( + code_page, data, check, + )?), } } @@ -157,7 +174,8 @@ fn test_decode_to_string() { assert_eq!( decode_to_string( Encoding::Utf8, - &[228, 184, 173, 230, 150, 135, 230, 181, 139, 232, 175, 149] + &[228, 184, 173, 230, 150, 135, 230, 181, 139, 232, 175, 149], + true ) .unwrap(), "中文测试".to_string() @@ -167,19 +185,21 @@ fn test_decode_to_string() { Encoding::Cp932, &[ 130, 171, 130, 225, 130, 215, 130, 194, 130, 187, 130, 211, 130, 198 - ] + ], + true ) .unwrap(), "きゃべつそふと".to_string() ); assert_eq!( - decode_to_string(Encoding::Gb2312, &[214, 208, 206, 196]).unwrap(), + decode_to_string(Encoding::Gb2312, &[214, 208, 206, 196], true).unwrap(), "中文".to_string() ); assert_eq!( decode_to_string( Encoding::Auto, - &[228, 184, 173, 230, 150, 135, 230, 181, 139, 232, 175, 149] + &[228, 184, 173, 230, 150, 135, 230, 181, 139, 232, 175, 149], + true ) .unwrap(), "中文测试".to_string() @@ -189,14 +209,15 @@ fn test_decode_to_string() { Encoding::Auto, &[ 130, 171, 130, 225, 130, 215, 130, 194, 130, 187, 130, 211, 130, 198 - ] + ], + true ) .unwrap(), "きゃべつそふと".to_string() ); #[cfg(windows)] assert_eq!( - decode_to_string(Encoding::CodePage(936), &[214, 208, 206, 196]).unwrap(), + decode_to_string(Encoding::CodePage(936), &[214, 208, 206, 196], true).unwrap(), "中文".to_string() ); } @@ -227,21 +248,23 @@ fn test_encode_string() { #[test] fn test_decode_with_bom_detect() { let utf8_data = vec![0xEF, 0xBB, 0xBF, 0xE4, 0xB8, 0xAD, 0xE6, 0x96, 0x87]; - let (decoded_utf8, bom_type) = 
decode_with_bom_detect(Encoding::Auto, &utf8_data).unwrap(); + let (decoded_utf8, bom_type) = + decode_with_bom_detect(Encoding::Auto, &utf8_data, true).unwrap(); assert_eq!(decoded_utf8, "中文"); assert_eq!(bom_type, BomType::Utf8); let utf16le_data = vec![0xFF, 0xFE, 0x2D, 0x4E, 0x87, 0x65]; let (decoded_utf16le, bom_type) = - decode_with_bom_detect(Encoding::Auto, &utf16le_data).unwrap(); + decode_with_bom_detect(Encoding::Auto, &utf16le_data, true).unwrap(); assert_eq!(decoded_utf16le, "中文"); assert_eq!(bom_type, BomType::Utf16LE); let utf16be_data = vec![0xFE, 0xFF, 0x4E, 0x2D, 0x65, 0x87]; let (decoded_utf16be, bom_type) = - decode_with_bom_detect(Encoding::Auto, &utf16be_data).unwrap(); + decode_with_bom_detect(Encoding::Auto, &utf16be_data, true).unwrap(); assert_eq!(decoded_utf16be, "中文"); assert_eq!(bom_type, BomType::Utf16BE); let no_bom_data = vec![0xE4, 0xB8, 0xAD, 0xE6, 0x96, 0x87]; - let (decoded_no_bom, bom_type) = decode_with_bom_detect(Encoding::Auto, &no_bom_data).unwrap(); + let (decoded_no_bom, bom_type) = + decode_with_bom_detect(Encoding::Auto, &no_bom_data, true).unwrap(); assert_eq!(decoded_no_bom, "中文"); assert_eq!(bom_type, BomType::None); #[cfg(feature = "kirikiri")] @@ -251,7 +274,7 @@ fn test_decode_with_bom_detect() { 0x11, 0x00, 0x34, 0x00, 0x36, 0x00, 0x3a, 0x00, 0x11, 0x00, 0x0e, 0x00, 0x05, 0x00, ]; let (decoded_simple_crypt, bom_type) = - decode_with_bom_detect(Encoding::Auto, &simple_crypt_data).unwrap(); + decode_with_bom_detect(Encoding::Auto, &simple_crypt_data, true).unwrap(); assert_eq!(decoded_simple_crypt, "\"895\"\r\n"); assert_eq!(bom_type, BomType::Utf16LE); } diff --git a/src/utils/encoding_win.rs b/src/utils/encoding_win.rs index 336acb0..104a5b8 100644 --- a/src/utils/encoding_win.rs +++ b/src/utils/encoding_win.rs @@ -1,4 +1,4 @@ -use windows_sys::Win32::Foundation::GetLastError; +use windows_sys::Win32::Foundation::{ERROR_NO_UNICODE_TRANSLATION, GetLastError}; use windows_sys::Win32::Globalization::{ CP_UTF7, 
CP_UTF8, MB_ERR_INVALID_CHARS, MultiByteToWideChar, WideCharToMultiByte, }; @@ -47,14 +47,15 @@ impl std::fmt::Display for WinError { } } -pub fn decode_to_string(cp: u32, data: &[u8]) -> Result { +pub fn decode_to_string(cp: u32, data: &[u8], check: bool) -> Result { if data.is_empty() { return Ok(String::new()); } + let dwflags = if check { MB_ERR_INVALID_CHARS } else { 0 }; let needed_len = unsafe { MultiByteToWideChar( cp, - MB_ERR_INVALID_CHARS, + dwflags, data.as_ptr() as _, data.len() as i32, std::ptr::null_mut(), @@ -64,12 +65,24 @@ pub fn decode_to_string(cp: u32, data: &[u8]) -> Result { if needed_len == 0 { return Err(WinError::from_last_error()); } + let last_error = unsafe { GetLastError() }; + if last_error == ERROR_NO_UNICODE_TRANSLATION { + if check { + return Err(WinError::new(last_error)); + } else { + eprintln!( + "Warning: Some characters could not be decoded in code page {}: {:?}", + cp, data + ); + crate::COUNTER.inc_warning(); + } + } let mut wc = Vec::with_capacity(needed_len as usize); wc.resize(needed_len as usize, 0); let result = unsafe { MultiByteToWideChar( cp, - MB_ERR_INVALID_CHARS, + dwflags, data.as_ptr() as _, data.len() as i32, wc.as_mut_ptr(), @@ -143,7 +156,8 @@ fn test_decode_to_string() { assert_eq!( decode_to_string( 65001, - &[228, 184, 173, 230, 150, 135, 230, 181, 139, 232, 175, 149] + &[228, 184, 173, 230, 150, 135, 230, 181, 139, 232, 175, 149], + true ) .unwrap(), "中文测试".to_string() @@ -153,13 +167,14 @@ fn test_decode_to_string() { 932, &[ 130, 171, 130, 225, 130, 215, 130, 194, 130, 187, 130, 211, 130, 198 - ] + ], + true ) .unwrap(), "きゃべつそふと".to_string() ); assert_eq!( - decode_to_string(936, &[214, 208, 206, 196]).unwrap(), + decode_to_string(936, &[214, 208, 206, 196], true).unwrap(), "中文".to_string() ); }