From b9ee9d1e429fa94b4ddb015a0df660a6fad84977 Mon Sep 17 00:00:00 2001 From: lifegpc Date: Tue, 12 Aug 2025 22:54:20 +0800 Subject: [PATCH] Add support to decode/encode 0xff in JIS on all platform --- Cargo.lock | 11 +- Cargo.toml | 2 +- src/scripts/will_plus/ws2.rs | 31 +++-- src/scripts/will_plus/ws2_disasm.rs | 16 +-- src/utils/encoding.rs | 180 +++++++++++++++++++++------- 5 files changed, 161 insertions(+), 79 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 0b3f883..3d7b592 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -387,15 +387,6 @@ version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a246d82be1c9d791c5dfde9a2bd045fc3cbba3fa2b11ad558f27d01712f00569" -[[package]] -name = "encoding_rs" -version = "0.8.35" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "75030f3c4f45dafd7586dd6780965a8c7e8e285a5ecb86713e63a79c5b2766f3" -dependencies = [ - "cfg-if", -] - [[package]] name = "equivalent" version = "1.0.2" @@ -814,7 +805,7 @@ dependencies = [ "csv", "ctrlc", "emote-psb", - "encoding_rs", + "encoding", "fancy-regex", "flate2", "int-enum", diff --git a/Cargo.toml b/Cargo.toml index 82a6b32..b36d8fa 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -15,7 +15,7 @@ clap-num = "1.2" csv = "1.3" ctrlc = "3.4" emote-psb = { version = "0.5", optional = true , features = ["serde"] } -encoding_rs = "0.8" +encoding = "0.2" fancy-regex = { version = "0.16", optional = true } flate2 = { version = "1.1", optional = true } int-enum = { version = "1.2", optional = true } diff --git a/src/scripts/will_plus/ws2.rs b/src/scripts/will_plus/ws2.rs index af4a85f..086717d 100644 --- a/src/scripts/will_plus/ws2.rs +++ b/src/scripts/will_plus/ws2.rs @@ -337,6 +337,7 @@ pub struct Ws2DisasmScript { addresses: Vec, /// Need encrypt when outputting encrypted: bool, + encoding: Encoding, } impl Ws2DisasmScript { @@ -352,13 +353,14 @@ impl Ws2DisasmScript { config: &ExtraConfig, decrypted: bool, ) -> Result { - match disassmble(&buf, encoding) { + match disassmble(&buf) { Ok((addresses, texts)) => { return Ok(Self { data: MemReader::new(buf.to_vec()), texts, addresses, encrypted: decrypted, + encoding, }); } Err(e) => { @@ -389,15 +391,16 @@ impl Script for Ws2DisasmScript { for text in &self.texts { match text.typ { StringType::Name => { - let text = text - .text + let text = decode_to_string(self.encoding, text.text.as_bytes(), false)? .trim_start_matches("%LC") .trim_start_matches("%LF") .to_string(); name = Some(text); } StringType::Message => { - let message = text.text.trim_end_matches("%K%P").to_string(); + let message = decode_to_string(self.encoding, text.text.as_bytes(), false)? + .trim_end_matches("%K%P") + .to_string(); messages.push(Message { message, name: name.take(), @@ -431,11 +434,11 @@ impl Script for Ws2DisasmScript { |s| Ok(s), )?; for s in &self.texts { - let text = match s.typ { + let mut encoded = match s.typ { StringType::Name => { - let prefix = if s.text.starts_with("%LC") { + let prefix = if s.text.as_bytes().starts_with(b"%LC") { "%LC" - } else if s.text.starts_with("%LF") { + } else if s.text.as_bytes().starts_with(b"%LF") { "%LF" } else { "" @@ -456,10 +459,14 @@ impl Script for Ws2DisasmScript { } } name = prefix.to_owned() + &name; - name + encode_string(encoding, &name, false)? } StringType::Message => { - let suffix = if s.text.ends_with("%K%P") { "%K%P" } else { "" }; + let suffix = if s.text.as_bytes().ends_with(b"%K%P") { + "%K%P" + } else { + "" + }; let m = match mess { Some(m) => m, None => { @@ -473,11 +480,11 @@ impl Script for Ws2DisasmScript { } } mess = mes.next(); - message + suffix + message.push_str(suffix); + encode_string(encoding, &message, false)? } - StringType::Internal => s.text.clone(), + StringType::Internal => s.text.as_bytes().to_vec(), }; - let mut encoded = encode_string(encoding, &text, false)?; encoded.push(0); // Null terminator patcher.copy_up_to(s.offset as u64)?; patcher.replace_bytes(s.len as u64, &encoded)?; diff --git a/src/scripts/will_plus/ws2_disasm.rs b/src/scripts/will_plus/ws2_disasm.rs index b17fb82..ac2fc71 100644 --- a/src/scripts/will_plus/ws2_disasm.rs +++ b/src/scripts/will_plus/ws2_disasm.rs @@ -1,8 +1,7 @@ use crate::ext::io::*; -use crate::types::*; -use crate::utils::encoding::*; use anyhow::Result; use std::any::Any; +use std::ffi::CString; pub trait Disasm: Sized { fn disassmble(self) -> Result<(Vec, Vec)>; @@ -36,7 +35,7 @@ pub enum StringType { #[derive(Debug, Clone)] pub struct Ws2DString { - pub text: String, + pub text: CString, pub offset: usize, pub len: usize, pub typ: StringType, @@ -47,17 +46,15 @@ struct DisasmBase<'a> { opers: &'a [(u8, &'static [Oper])], addresses: Vec, texts: Vec, - encoding: Encoding, } impl<'a> DisasmBase<'a> { - pub fn new(data: &'a [u8], opers: &'a [(u8, &'static [Oper])], encoding: Encoding) -> Self { + pub fn new(data: &'a [u8], opers: &'a [(u8, &'static [Oper])]) -> Self { DisasmBase { reader: MemReaderRef::new(data), opers, addresses: Vec::new(), texts: Vec::new(), - encoding, } } @@ -120,10 +117,9 @@ impl<'a> DisasmBase<'a> { S => { let offset = self.reader.pos; let s = self.reader.read_cstring()?; - let decoded = decode_to_string(self.encoding, s.as_bytes(), false)?; let len = s.as_bytes_with_nul().len(); let str = Ws2DString { - text: decoded, + text: s, offset, len, typ: StringType::Internal, @@ -643,9 +639,9 @@ const V3_OPS: [(u8, &'static [Oper]); 165] = [ const OPS: [&[(u8, &'static [Oper])]; 3] = [&V1_OPS, &V2_OPS, &V3_OPS]; -pub fn disassmble(data: &[u8], encoding: Encoding) -> Result<(Vec, Vec)> { +pub fn disassmble(data: &[u8]) -> Result<(Vec, Vec)> { for op in &OPS { - let disasm = DisasmBase::new(data, op, encoding); + let disasm = DisasmBase::new(data, op); match disasm.disassmble() { Ok(result) => return Ok(result), Err(_) => continue, // Try the next version if this one fails diff --git a/src/utils/encoding.rs b/src/utils/encoding.rs index 4642e5c..b9bf92f 100644 --- a/src/utils/encoding.rs +++ b/src/utils/encoding.rs @@ -1,5 +1,8 @@ //! Encoding Utilities +use crate::ext::atomic::*; use crate::types::*; +use encoding::{ByteWriter, DecoderTrap, EncoderTrap, Encoding as EncodingTrait, RawEncoder}; +use std::sync::atomic::AtomicBool; /// Decodes a byte slice to a string using the specified encoding with BOM detection. /// @@ -13,19 +16,33 @@ pub fn decode_with_bom_detect( ) -> Result<(String, BomType), anyhow::Error> { if data.len() >= 2 { if data[0] == 0xFE && data[1] == 0xFF { - let result = encoding_rs::UTF_16BE.decode(&data[2..]); - if result.2 { - return Err(anyhow::anyhow!("Failed to decode UTF-16BE")); - } else { - return Ok((result.0.into_owned(), BomType::Utf16BE)); - } + return Ok(( + encoding::codec::utf_16::UTF_16BE_ENCODING + .decode( + &data[2..], + if check { + DecoderTrap::Strict + } else { + DecoderTrap::Replace + }, + ) + .map_err(|_| anyhow::anyhow!("Failed to decode UTF-16BE"))?, + BomType::Utf16BE, + )); } else if data[0] == 0xFF && data[1] == 0xFE { - let result = encoding_rs::UTF_16LE.decode(&data[2..]); - if result.2 { - return Err(anyhow::anyhow!("Failed to decode UTF-16LE")); - } else { - return Ok((result.0.into_owned(), BomType::Utf16LE)); - } + return Ok(( + encoding::codec::utf_16::UTF_16LE_ENCODING + .decode( + &data[2..], + if check { + DecoderTrap::Strict + } else { + DecoderTrap::Replace + }, + ) + .map_err(|_| anyhow::anyhow!("Failed to decode UTF-16LE"))?, + BomType::Utf16LE, + )); } } if data.len() >= 3 { @@ -73,32 +90,51 @@ pub fn decode_to_string( .or_else(|_| decode_to_string(Encoding::Gb2312, data, check)), Encoding::Utf8 => Ok(String::from_utf8(data.to_vec())?), Encoding::Cp932 => { - let result = encoding_rs::SHIFT_JIS.decode(data); - if result.2 { - if check { - return Err(anyhow::anyhow!("Failed to decode Shift-JIS")); - } + let result = encoding::codec::japanese::Windows31JEncoding + .decode( + data, + if check { + DecoderTrap::Strict + } else { + DecoderTrap::Call(|_, d, out| { + if d.len() == 1 && d[0] == 0xFF { + out.write_char('\u{f8f3}'); // PUA character for U+F8F3 + } else { + out.write_char('\u{FFFD}'); // Replacement character + } + true + }) + }, + ) + .map_err(|_| anyhow::anyhow!("Failed to decode Shift-JIS"))?; + if result.contains('\u{FFFD}') { eprintln!( "Warning: Some characters could not be decoded in Shift-JIS: {:?}", data ); crate::COUNTER.inc_warning(); } - Ok(result.0.to_string()) + Ok(result) } Encoding::Gb2312 => { - let result = encoding_rs::GBK.decode(data); - if result.2 { - if check { - return Err(anyhow::anyhow!("Failed to decode GB2312")); - } + let result = encoding::codec::simpchinese::GBK_ENCODING + .decode( + data, + if check { + DecoderTrap::Strict + } else { + DecoderTrap::Replace + }, + ) + .map_err(|_| anyhow::anyhow!("Failed to decode GB2312"))?; + if result.contains('\u{FFFD}') { eprintln!( "Warning: Some characters could not be decoded in GB2312: {:?}", data ); crate::COUNTER.inc_warning(); } - Ok(result.0.to_string()) + Ok(result) } #[cfg(windows)] Encoding::CodePage(code_page) => Ok(super::encoding_win::decode_to_string( @@ -107,6 +143,26 @@ pub fn decode_to_string( } } +thread_local! { + static ENCODE_REPLACED: AtomicBool = AtomicBool::new(false); +} + +fn jis_encoder_trap(_: &mut dyn RawEncoder, data: &str, out: &mut dyn ByteWriter) -> bool { + if data == "\u{f8f3}" { + out.write_byte(0xFF); // PUA character for U+F8F3 + } else { + out.write_byte(b'?'); // Replacement character + ENCODE_REPLACED.with(|f| f.qsave(true)); + } + true +} + +fn gbk_encoder_trap(_: &mut dyn RawEncoder, _: &str, out: &mut dyn ByteWriter) -> bool { + out.write_byte(b'?'); // Replacement character + ENCODE_REPLACED.with(|f| f.qsave(true)); + true +} + /// Encodes a string to a byte vector using the specified encoding. /// /// * `check` - If true, checks for encoding errors and returns an error if any. @@ -119,32 +175,50 @@ pub fn encode_string( Encoding::Auto => Ok(data.as_bytes().to_vec()), Encoding::Utf8 => Ok(data.as_bytes().to_vec()), Encoding::Cp932 => { - let result = encoding_rs::SHIFT_JIS.encode(data); - if result.2 { - if check { - return Err(anyhow::anyhow!("Failed to encode Shift-JIS")); + ENCODE_REPLACED.with(|f| f.qsave(false)); + let result = encoding::codec::japanese::Windows31JEncoding + .encode( + data, + if check { + EncoderTrap::Strict + } else { + EncoderTrap::Call(jis_encoder_trap) + }, + ) + .map_err(|_| anyhow::anyhow!("Failed to encode Shift-JIS"))?; + ENCODE_REPLACED.with(|f| { + if f.qload() { + eprintln!( + "Warning: Some characters could not be encoded in Shift-JIS: {}", + data + ); + crate::COUNTER.inc_warning(); } - eprintln!( - "Warning: Some characters could not be encoded in Shift-JIS: {}", - data - ); - crate::COUNTER.inc_warning(); - } - Ok(result.0.to_vec()) + }); + Ok(result) } Encoding::Gb2312 => { - let result = encoding_rs::GBK.encode(data); - if result.2 { - if check { - return Err(anyhow::anyhow!("Failed to encode GB2312")); + ENCODE_REPLACED.with(|f| f.qsave(false)); + let result = encoding::codec::simpchinese::GBK_ENCODING + .encode( + data, + if check { + EncoderTrap::Strict + } else { + EncoderTrap::Call(gbk_encoder_trap) + }, + ) + .map_err(|_| anyhow::anyhow!("Failed to encode GB2312"))?; + ENCODE_REPLACED.with(|f| { + if f.qload() { + eprintln!( + "Warning: Some characters could not be encoded in GB2312: {}", + data + ); + crate::COUNTER.inc_warning(); } - eprintln!( - "Warning: Some characters could not be encoded in GB2312: {}", - data - ); - crate::COUNTER.inc_warning(); - } - Ok(result.0.to_vec()) + }); + Ok(result) } #[cfg(windows)] Encoding::CodePage(code_page) => { @@ -315,3 +389,17 @@ fn test_encode_string_with_bom() { vec![0xE4, 0xB8, 0xAD, 0xE6, 0x96, 0x87] ); } + +#[test] +fn shift_jis_pua_test() { + let ff = [0xFF, 0x01]; + #[cfg(windows)] + assert_eq!( + decode_to_string(Encoding::CodePage(932), &ff, false).unwrap(), + "\u{f8f3}\x01".to_string() + ); + assert_eq!( + decode_to_string(Encoding::Cp932, &ff, false).unwrap(), + "\u{f8f3}\x01".to_string() + ); +}