Add support for decoding/encoding 0xFF in Shift-JIS (CP932) on all platforms

This commit is contained in:
2025-08-12 22:54:20 +08:00
parent 9d184bd2d6
commit b9ee9d1e42
5 changed files with 161 additions and 79 deletions

11
Cargo.lock generated
View File

@@ -387,15 +387,6 @@ version = "0.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a246d82be1c9d791c5dfde9a2bd045fc3cbba3fa2b11ad558f27d01712f00569"
[[package]]
name = "encoding_rs"
version = "0.8.35"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "75030f3c4f45dafd7586dd6780965a8c7e8e285a5ecb86713e63a79c5b2766f3"
dependencies = [
"cfg-if",
]
[[package]]
name = "equivalent"
version = "1.0.2"
@@ -814,7 +805,7 @@ dependencies = [
"csv",
"ctrlc",
"emote-psb",
"encoding_rs",
"encoding",
"fancy-regex",
"flate2",
"int-enum",

View File

@@ -15,7 +15,7 @@ clap-num = "1.2"
csv = "1.3"
ctrlc = "3.4"
emote-psb = { version = "0.5", optional = true , features = ["serde"] }
encoding_rs = "0.8"
encoding = "0.2"
fancy-regex = { version = "0.16", optional = true }
flate2 = { version = "1.1", optional = true }
int-enum = { version = "1.2", optional = true }

View File

@@ -337,6 +337,7 @@ pub struct Ws2DisasmScript {
addresses: Vec<usize>,
/// Need encrypt when outputting
encrypted: bool,
encoding: Encoding,
}
impl Ws2DisasmScript {
@@ -352,13 +353,14 @@ impl Ws2DisasmScript {
config: &ExtraConfig,
decrypted: bool,
) -> Result<Self> {
match disassmble(&buf, encoding) {
match disassmble(&buf) {
Ok((addresses, texts)) => {
return Ok(Self {
data: MemReader::new(buf.to_vec()),
texts,
addresses,
encrypted: decrypted,
encoding,
});
}
Err(e) => {
@@ -389,15 +391,16 @@ impl Script for Ws2DisasmScript {
for text in &self.texts {
match text.typ {
StringType::Name => {
let text = text
.text
let text = decode_to_string(self.encoding, text.text.as_bytes(), false)?
.trim_start_matches("%LC")
.trim_start_matches("%LF")
.to_string();
name = Some(text);
}
StringType::Message => {
let message = text.text.trim_end_matches("%K%P").to_string();
let message = decode_to_string(self.encoding, text.text.as_bytes(), false)?
.trim_end_matches("%K%P")
.to_string();
messages.push(Message {
message,
name: name.take(),
@@ -431,11 +434,11 @@ impl Script for Ws2DisasmScript {
|s| Ok(s),
)?;
for s in &self.texts {
let text = match s.typ {
let mut encoded = match s.typ {
StringType::Name => {
let prefix = if s.text.starts_with("%LC") {
let prefix = if s.text.as_bytes().starts_with(b"%LC") {
"%LC"
} else if s.text.starts_with("%LF") {
} else if s.text.as_bytes().starts_with(b"%LF") {
"%LF"
} else {
""
@@ -456,10 +459,14 @@ impl Script for Ws2DisasmScript {
}
}
name = prefix.to_owned() + &name;
name
encode_string(encoding, &name, false)?
}
StringType::Message => {
let suffix = if s.text.ends_with("%K%P") { "%K%P" } else { "" };
let suffix = if s.text.as_bytes().ends_with(b"%K%P") {
"%K%P"
} else {
""
};
let m = match mess {
Some(m) => m,
None => {
@@ -473,11 +480,11 @@ impl Script for Ws2DisasmScript {
}
}
mess = mes.next();
message + suffix
message.push_str(suffix);
encode_string(encoding, &message, false)?
}
StringType::Internal => s.text.clone(),
StringType::Internal => s.text.as_bytes().to_vec(),
};
let mut encoded = encode_string(encoding, &text, false)?;
encoded.push(0); // Null terminator
patcher.copy_up_to(s.offset as u64)?;
patcher.replace_bytes(s.len as u64, &encoded)?;

View File

@@ -1,8 +1,7 @@
use crate::ext::io::*;
use crate::types::*;
use crate::utils::encoding::*;
use anyhow::Result;
use std::any::Any;
use std::ffi::CString;
pub trait Disasm: Sized {
fn disassmble(self) -> Result<(Vec<usize>, Vec<Ws2DString>)>;
@@ -36,7 +35,7 @@ pub enum StringType {
#[derive(Debug, Clone)]
pub struct Ws2DString {
pub text: String,
pub text: CString,
pub offset: usize,
pub len: usize,
pub typ: StringType,
@@ -47,17 +46,15 @@ struct DisasmBase<'a> {
opers: &'a [(u8, &'static [Oper])],
addresses: Vec<usize>,
texts: Vec<Ws2DString>,
encoding: Encoding,
}
impl<'a> DisasmBase<'a> {
pub fn new(data: &'a [u8], opers: &'a [(u8, &'static [Oper])], encoding: Encoding) -> Self {
pub fn new(data: &'a [u8], opers: &'a [(u8, &'static [Oper])]) -> Self {
DisasmBase {
reader: MemReaderRef::new(data),
opers,
addresses: Vec::new(),
texts: Vec::new(),
encoding,
}
}
@@ -120,10 +117,9 @@ impl<'a> DisasmBase<'a> {
S => {
let offset = self.reader.pos;
let s = self.reader.read_cstring()?;
let decoded = decode_to_string(self.encoding, s.as_bytes(), false)?;
let len = s.as_bytes_with_nul().len();
let str = Ws2DString {
text: decoded,
text: s,
offset,
len,
typ: StringType::Internal,
@@ -643,9 +639,9 @@ const V3_OPS: [(u8, &'static [Oper]); 165] = [
const OPS: [&[(u8, &'static [Oper])]; 3] = [&V1_OPS, &V2_OPS, &V3_OPS];
pub fn disassmble(data: &[u8], encoding: Encoding) -> Result<(Vec<usize>, Vec<Ws2DString>)> {
pub fn disassmble(data: &[u8]) -> Result<(Vec<usize>, Vec<Ws2DString>)> {
for op in &OPS {
let disasm = DisasmBase::new(data, op, encoding);
let disasm = DisasmBase::new(data, op);
match disasm.disassmble() {
Ok(result) => return Ok(result),
Err(_) => continue, // Try the next version if this one fails

View File

@@ -1,5 +1,8 @@
//! Encoding Utilities
use crate::ext::atomic::*;
use crate::types::*;
use encoding::{ByteWriter, DecoderTrap, EncoderTrap, Encoding as EncodingTrait, RawEncoder};
use std::sync::atomic::AtomicBool;
/// Decodes a byte slice to a string using the specified encoding with BOM detection.
///
@@ -13,19 +16,33 @@ pub fn decode_with_bom_detect(
) -> Result<(String, BomType), anyhow::Error> {
if data.len() >= 2 {
if data[0] == 0xFE && data[1] == 0xFF {
let result = encoding_rs::UTF_16BE.decode(&data[2..]);
if result.2 {
return Err(anyhow::anyhow!("Failed to decode UTF-16BE"));
} else {
return Ok((result.0.into_owned(), BomType::Utf16BE));
}
return Ok((
encoding::codec::utf_16::UTF_16BE_ENCODING
.decode(
&data[2..],
if check {
DecoderTrap::Strict
} else {
DecoderTrap::Replace
},
)
.map_err(|_| anyhow::anyhow!("Failed to decode UTF-16BE"))?,
BomType::Utf16BE,
));
} else if data[0] == 0xFF && data[1] == 0xFE {
let result = encoding_rs::UTF_16LE.decode(&data[2..]);
if result.2 {
return Err(anyhow::anyhow!("Failed to decode UTF-16LE"));
} else {
return Ok((result.0.into_owned(), BomType::Utf16LE));
}
return Ok((
encoding::codec::utf_16::UTF_16LE_ENCODING
.decode(
&data[2..],
if check {
DecoderTrap::Strict
} else {
DecoderTrap::Replace
},
)
.map_err(|_| anyhow::anyhow!("Failed to decode UTF-16LE"))?,
BomType::Utf16LE,
));
}
}
if data.len() >= 3 {
@@ -73,32 +90,51 @@ pub fn decode_to_string(
.or_else(|_| decode_to_string(Encoding::Gb2312, data, check)),
Encoding::Utf8 => Ok(String::from_utf8(data.to_vec())?),
Encoding::Cp932 => {
let result = encoding_rs::SHIFT_JIS.decode(data);
if result.2 {
if check {
return Err(anyhow::anyhow!("Failed to decode Shift-JIS"));
}
let result = encoding::codec::japanese::Windows31JEncoding
.decode(
data,
if check {
DecoderTrap::Strict
} else {
DecoderTrap::Call(|_, d, out| {
if d.len() == 1 && d[0] == 0xFF {
out.write_char('\u{f8f3}'); // PUA character for U+F8F3
} else {
out.write_char('\u{FFFD}'); // Replacement character
}
true
})
},
)
.map_err(|_| anyhow::anyhow!("Failed to decode Shift-JIS"))?;
if result.contains('\u{FFFD}') {
eprintln!(
"Warning: Some characters could not be decoded in Shift-JIS: {:?}",
data
);
crate::COUNTER.inc_warning();
}
Ok(result.0.to_string())
Ok(result)
}
Encoding::Gb2312 => {
let result = encoding_rs::GBK.decode(data);
if result.2 {
if check {
return Err(anyhow::anyhow!("Failed to decode GB2312"));
}
let result = encoding::codec::simpchinese::GBK_ENCODING
.decode(
data,
if check {
DecoderTrap::Strict
} else {
DecoderTrap::Replace
},
)
.map_err(|_| anyhow::anyhow!("Failed to decode GB2312"))?;
if result.contains('\u{FFFD}') {
eprintln!(
"Warning: Some characters could not be decoded in GB2312: {:?}",
data
);
crate::COUNTER.inc_warning();
}
Ok(result.0.to_string())
Ok(result)
}
#[cfg(windows)]
Encoding::CodePage(code_page) => Ok(super::encoding_win::decode_to_string(
@@ -107,6 +143,26 @@ pub fn decode_to_string(
}
}
thread_local! {
// Per-thread flag shared between `encode_string` and the encoder trap
// callbacks. `encode_string` resets it with `qsave(false)` before encoding,
// the traps set it with `qsave(true)` whenever they emit a `?` substitute,
// and `encode_string` reads it back with `qload()` to decide whether to print
// a warning. A side channel is used because the trap callbacks only receive
// the encoder, the offending text and the output writer.
static ENCODE_REPLACED: AtomicBool = AtomicBool::new(false);
}
/// Encoder trap used for Shift-JIS output when strict checking is disabled.
///
/// * `data` - The piece of input text the encoder could not represent.
/// * `out` - Destination for the substitute byte.
///
/// The private-use character U+F8F3 is written as the single byte `0xFF`,
/// mirroring the decoder side which maps a lone `0xFF` byte to U+F8F3. Anything
/// else is replaced by `?` and flagged through `ENCODE_REPLACED` so that
/// `encode_string` can report the loss.
///
/// Always returns `true`, i.e. the error is treated as handled.
fn jis_encoder_trap(_: &mut dyn RawEncoder, data: &str, out: &mut dyn ByteWriter) -> bool {
    match data {
        // U+F8F3 round-trips to the raw byte 0xFF.
        "\u{f8f3}" => out.write_byte(0xFF),
        _ => {
            // Unencodable text: emit a placeholder and remember that we did.
            out.write_byte(b'?');
            ENCODE_REPLACED.with(|flag| flag.qsave(true));
        }
    }
    true
}
/// Encoder trap used for GBK output when strict checking is disabled.
///
/// Every piece of text the encoder cannot represent is replaced by a single
/// `?` byte written to `out`, and `ENCODE_REPLACED` is set so that
/// `encode_string` can emit a warning afterwards.
///
/// Always returns `true`, i.e. the error is treated as handled.
fn gbk_encoder_trap(
    _encoder: &mut dyn RawEncoder,
    _unencodable: &str,
    out: &mut dyn ByteWriter,
) -> bool {
    // Emit the placeholder first, then record that a substitution happened.
    out.write_byte(b'?');
    ENCODE_REPLACED.with(|flag| {
        flag.qsave(true);
    });
    true
}
/// Encodes a string to a byte vector using the specified encoding.
///
/// * `check` - If true, checks for encoding errors and returns an error if any.
@@ -119,32 +175,50 @@ pub fn encode_string(
Encoding::Auto => Ok(data.as_bytes().to_vec()),
Encoding::Utf8 => Ok(data.as_bytes().to_vec()),
Encoding::Cp932 => {
let result = encoding_rs::SHIFT_JIS.encode(data);
if result.2 {
if check {
return Err(anyhow::anyhow!("Failed to encode Shift-JIS"));
ENCODE_REPLACED.with(|f| f.qsave(false));
let result = encoding::codec::japanese::Windows31JEncoding
.encode(
data,
if check {
EncoderTrap::Strict
} else {
EncoderTrap::Call(jis_encoder_trap)
},
)
.map_err(|_| anyhow::anyhow!("Failed to encode Shift-JIS"))?;
ENCODE_REPLACED.with(|f| {
if f.qload() {
eprintln!(
"Warning: Some characters could not be encoded in Shift-JIS: {}",
data
);
crate::COUNTER.inc_warning();
}
eprintln!(
"Warning: Some characters could not be encoded in Shift-JIS: {}",
data
);
crate::COUNTER.inc_warning();
}
Ok(result.0.to_vec())
});
Ok(result)
}
Encoding::Gb2312 => {
let result = encoding_rs::GBK.encode(data);
if result.2 {
if check {
return Err(anyhow::anyhow!("Failed to encode GB2312"));
ENCODE_REPLACED.with(|f| f.qsave(false));
let result = encoding::codec::simpchinese::GBK_ENCODING
.encode(
data,
if check {
EncoderTrap::Strict
} else {
EncoderTrap::Call(gbk_encoder_trap)
},
)
.map_err(|_| anyhow::anyhow!("Failed to encode GB2312"))?;
ENCODE_REPLACED.with(|f| {
if f.qload() {
eprintln!(
"Warning: Some characters could not be encoded in GB2312: {}",
data
);
crate::COUNTER.inc_warning();
}
eprintln!(
"Warning: Some characters could not be encoded in GB2312: {}",
data
);
crate::COUNTER.inc_warning();
}
Ok(result.0.to_vec())
});
Ok(result)
}
#[cfg(windows)]
Encoding::CodePage(code_page) => {
@@ -315,3 +389,17 @@ fn test_encode_string_with_bom() {
vec![0xE4, 0xB8, 0xAD, 0xE6, 0x96, 0x87]
);
}
/// A lone `0xFF` byte in Shift-JIS input must decode to the private-use
/// character U+F8F3, and the byte that follows it must be left untouched.
#[test]
fn shift_jis_pua_test() {
    let raw: [u8; 2] = [0xFF, 0x01];
    let expected = String::from("\u{f8f3}\x01");

    // The Windows code-page backend only exists on Windows builds.
    #[cfg(windows)]
    assert_eq!(
        decode_to_string(Encoding::CodePage(932), &raw, false).unwrap(),
        expected
    );

    // The portable CP932 path must produce the same result on every platform.
    assert_eq!(
        decode_to_string(Encoding::Cp932, &raw, false).unwrap(),
        expected
    );
}