mirror of
https://github.com/lifegpc/msg-tool.git
synced 2026-06-06 12:58:45 +08:00
Add support to decode/encode 0xff in JIS on all platform
This commit is contained in:
11
Cargo.lock
generated
11
Cargo.lock
generated
@@ -387,15 +387,6 @@ version = "0.1.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a246d82be1c9d791c5dfde9a2bd045fc3cbba3fa2b11ad558f27d01712f00569"
|
||||
|
||||
[[package]]
|
||||
name = "encoding_rs"
|
||||
version = "0.8.35"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "75030f3c4f45dafd7586dd6780965a8c7e8e285a5ecb86713e63a79c5b2766f3"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "equivalent"
|
||||
version = "1.0.2"
|
||||
@@ -814,7 +805,7 @@ dependencies = [
|
||||
"csv",
|
||||
"ctrlc",
|
||||
"emote-psb",
|
||||
"encoding_rs",
|
||||
"encoding",
|
||||
"fancy-regex",
|
||||
"flate2",
|
||||
"int-enum",
|
||||
|
||||
@@ -15,7 +15,7 @@ clap-num = "1.2"
|
||||
csv = "1.3"
|
||||
ctrlc = "3.4"
|
||||
emote-psb = { version = "0.5", optional = true , features = ["serde"] }
|
||||
encoding_rs = "0.8"
|
||||
encoding = "0.2"
|
||||
fancy-regex = { version = "0.16", optional = true }
|
||||
flate2 = { version = "1.1", optional = true }
|
||||
int-enum = { version = "1.2", optional = true }
|
||||
|
||||
@@ -337,6 +337,7 @@ pub struct Ws2DisasmScript {
|
||||
addresses: Vec<usize>,
|
||||
/// Need encrypt when outputting
|
||||
encrypted: bool,
|
||||
encoding: Encoding,
|
||||
}
|
||||
|
||||
impl Ws2DisasmScript {
|
||||
@@ -352,13 +353,14 @@ impl Ws2DisasmScript {
|
||||
config: &ExtraConfig,
|
||||
decrypted: bool,
|
||||
) -> Result<Self> {
|
||||
match disassmble(&buf, encoding) {
|
||||
match disassmble(&buf) {
|
||||
Ok((addresses, texts)) => {
|
||||
return Ok(Self {
|
||||
data: MemReader::new(buf.to_vec()),
|
||||
texts,
|
||||
addresses,
|
||||
encrypted: decrypted,
|
||||
encoding,
|
||||
});
|
||||
}
|
||||
Err(e) => {
|
||||
@@ -389,15 +391,16 @@ impl Script for Ws2DisasmScript {
|
||||
for text in &self.texts {
|
||||
match text.typ {
|
||||
StringType::Name => {
|
||||
let text = text
|
||||
.text
|
||||
let text = decode_to_string(self.encoding, text.text.as_bytes(), false)?
|
||||
.trim_start_matches("%LC")
|
||||
.trim_start_matches("%LF")
|
||||
.to_string();
|
||||
name = Some(text);
|
||||
}
|
||||
StringType::Message => {
|
||||
let message = text.text.trim_end_matches("%K%P").to_string();
|
||||
let message = decode_to_string(self.encoding, text.text.as_bytes(), false)?
|
||||
.trim_end_matches("%K%P")
|
||||
.to_string();
|
||||
messages.push(Message {
|
||||
message,
|
||||
name: name.take(),
|
||||
@@ -431,11 +434,11 @@ impl Script for Ws2DisasmScript {
|
||||
|s| Ok(s),
|
||||
)?;
|
||||
for s in &self.texts {
|
||||
let text = match s.typ {
|
||||
let mut encoded = match s.typ {
|
||||
StringType::Name => {
|
||||
let prefix = if s.text.starts_with("%LC") {
|
||||
let prefix = if s.text.as_bytes().starts_with(b"%LC") {
|
||||
"%LC"
|
||||
} else if s.text.starts_with("%LF") {
|
||||
} else if s.text.as_bytes().starts_with(b"%LF") {
|
||||
"%LF"
|
||||
} else {
|
||||
""
|
||||
@@ -456,10 +459,14 @@ impl Script for Ws2DisasmScript {
|
||||
}
|
||||
}
|
||||
name = prefix.to_owned() + &name;
|
||||
name
|
||||
encode_string(encoding, &name, false)?
|
||||
}
|
||||
StringType::Message => {
|
||||
let suffix = if s.text.ends_with("%K%P") { "%K%P" } else { "" };
|
||||
let suffix = if s.text.as_bytes().ends_with(b"%K%P") {
|
||||
"%K%P"
|
||||
} else {
|
||||
""
|
||||
};
|
||||
let m = match mess {
|
||||
Some(m) => m,
|
||||
None => {
|
||||
@@ -473,11 +480,11 @@ impl Script for Ws2DisasmScript {
|
||||
}
|
||||
}
|
||||
mess = mes.next();
|
||||
message + suffix
|
||||
message.push_str(suffix);
|
||||
encode_string(encoding, &message, false)?
|
||||
}
|
||||
StringType::Internal => s.text.clone(),
|
||||
StringType::Internal => s.text.as_bytes().to_vec(),
|
||||
};
|
||||
let mut encoded = encode_string(encoding, &text, false)?;
|
||||
encoded.push(0); // Null terminator
|
||||
patcher.copy_up_to(s.offset as u64)?;
|
||||
patcher.replace_bytes(s.len as u64, &encoded)?;
|
||||
|
||||
@@ -1,8 +1,7 @@
|
||||
use crate::ext::io::*;
|
||||
use crate::types::*;
|
||||
use crate::utils::encoding::*;
|
||||
use anyhow::Result;
|
||||
use std::any::Any;
|
||||
use std::ffi::CString;
|
||||
|
||||
pub trait Disasm: Sized {
|
||||
fn disassmble(self) -> Result<(Vec<usize>, Vec<Ws2DString>)>;
|
||||
@@ -36,7 +35,7 @@ pub enum StringType {
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct Ws2DString {
|
||||
pub text: String,
|
||||
pub text: CString,
|
||||
pub offset: usize,
|
||||
pub len: usize,
|
||||
pub typ: StringType,
|
||||
@@ -47,17 +46,15 @@ struct DisasmBase<'a> {
|
||||
opers: &'a [(u8, &'static [Oper])],
|
||||
addresses: Vec<usize>,
|
||||
texts: Vec<Ws2DString>,
|
||||
encoding: Encoding,
|
||||
}
|
||||
|
||||
impl<'a> DisasmBase<'a> {
|
||||
pub fn new(data: &'a [u8], opers: &'a [(u8, &'static [Oper])], encoding: Encoding) -> Self {
|
||||
pub fn new(data: &'a [u8], opers: &'a [(u8, &'static [Oper])]) -> Self {
|
||||
DisasmBase {
|
||||
reader: MemReaderRef::new(data),
|
||||
opers,
|
||||
addresses: Vec::new(),
|
||||
texts: Vec::new(),
|
||||
encoding,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -120,10 +117,9 @@ impl<'a> DisasmBase<'a> {
|
||||
S => {
|
||||
let offset = self.reader.pos;
|
||||
let s = self.reader.read_cstring()?;
|
||||
let decoded = decode_to_string(self.encoding, s.as_bytes(), false)?;
|
||||
let len = s.as_bytes_with_nul().len();
|
||||
let str = Ws2DString {
|
||||
text: decoded,
|
||||
text: s,
|
||||
offset,
|
||||
len,
|
||||
typ: StringType::Internal,
|
||||
@@ -643,9 +639,9 @@ const V3_OPS: [(u8, &'static [Oper]); 165] = [
|
||||
|
||||
const OPS: [&[(u8, &'static [Oper])]; 3] = [&V1_OPS, &V2_OPS, &V3_OPS];
|
||||
|
||||
pub fn disassmble(data: &[u8], encoding: Encoding) -> Result<(Vec<usize>, Vec<Ws2DString>)> {
|
||||
pub fn disassmble(data: &[u8]) -> Result<(Vec<usize>, Vec<Ws2DString>)> {
|
||||
for op in &OPS {
|
||||
let disasm = DisasmBase::new(data, op, encoding);
|
||||
let disasm = DisasmBase::new(data, op);
|
||||
match disasm.disassmble() {
|
||||
Ok(result) => return Ok(result),
|
||||
Err(_) => continue, // Try the next version if this one fails
|
||||
|
||||
@@ -1,5 +1,8 @@
|
||||
//! Encoding Utilities
|
||||
use crate::ext::atomic::*;
|
||||
use crate::types::*;
|
||||
use encoding::{ByteWriter, DecoderTrap, EncoderTrap, Encoding as EncodingTrait, RawEncoder};
|
||||
use std::sync::atomic::AtomicBool;
|
||||
|
||||
/// Decodes a byte slice to a string using the specified encoding with BOM detection.
|
||||
///
|
||||
@@ -13,19 +16,33 @@ pub fn decode_with_bom_detect(
|
||||
) -> Result<(String, BomType), anyhow::Error> {
|
||||
if data.len() >= 2 {
|
||||
if data[0] == 0xFE && data[1] == 0xFF {
|
||||
let result = encoding_rs::UTF_16BE.decode(&data[2..]);
|
||||
if result.2 {
|
||||
return Err(anyhow::anyhow!("Failed to decode UTF-16BE"));
|
||||
} else {
|
||||
return Ok((result.0.into_owned(), BomType::Utf16BE));
|
||||
}
|
||||
return Ok((
|
||||
encoding::codec::utf_16::UTF_16BE_ENCODING
|
||||
.decode(
|
||||
&data[2..],
|
||||
if check {
|
||||
DecoderTrap::Strict
|
||||
} else {
|
||||
DecoderTrap::Replace
|
||||
},
|
||||
)
|
||||
.map_err(|_| anyhow::anyhow!("Failed to decode UTF-16BE"))?,
|
||||
BomType::Utf16BE,
|
||||
));
|
||||
} else if data[0] == 0xFF && data[1] == 0xFE {
|
||||
let result = encoding_rs::UTF_16LE.decode(&data[2..]);
|
||||
if result.2 {
|
||||
return Err(anyhow::anyhow!("Failed to decode UTF-16LE"));
|
||||
} else {
|
||||
return Ok((result.0.into_owned(), BomType::Utf16LE));
|
||||
}
|
||||
return Ok((
|
||||
encoding::codec::utf_16::UTF_16LE_ENCODING
|
||||
.decode(
|
||||
&data[2..],
|
||||
if check {
|
||||
DecoderTrap::Strict
|
||||
} else {
|
||||
DecoderTrap::Replace
|
||||
},
|
||||
)
|
||||
.map_err(|_| anyhow::anyhow!("Failed to decode UTF-16LE"))?,
|
||||
BomType::Utf16LE,
|
||||
));
|
||||
}
|
||||
}
|
||||
if data.len() >= 3 {
|
||||
@@ -73,32 +90,51 @@ pub fn decode_to_string(
|
||||
.or_else(|_| decode_to_string(Encoding::Gb2312, data, check)),
|
||||
Encoding::Utf8 => Ok(String::from_utf8(data.to_vec())?),
|
||||
Encoding::Cp932 => {
|
||||
let result = encoding_rs::SHIFT_JIS.decode(data);
|
||||
if result.2 {
|
||||
if check {
|
||||
return Err(anyhow::anyhow!("Failed to decode Shift-JIS"));
|
||||
}
|
||||
let result = encoding::codec::japanese::Windows31JEncoding
|
||||
.decode(
|
||||
data,
|
||||
if check {
|
||||
DecoderTrap::Strict
|
||||
} else {
|
||||
DecoderTrap::Call(|_, d, out| {
|
||||
if d.len() == 1 && d[0] == 0xFF {
|
||||
out.write_char('\u{f8f3}'); // PUA character for U+F8F3
|
||||
} else {
|
||||
out.write_char('\u{FFFD}'); // Replacement character
|
||||
}
|
||||
true
|
||||
})
|
||||
},
|
||||
)
|
||||
.map_err(|_| anyhow::anyhow!("Failed to decode Shift-JIS"))?;
|
||||
if result.contains('\u{FFFD}') {
|
||||
eprintln!(
|
||||
"Warning: Some characters could not be decoded in Shift-JIS: {:?}",
|
||||
data
|
||||
);
|
||||
crate::COUNTER.inc_warning();
|
||||
}
|
||||
Ok(result.0.to_string())
|
||||
Ok(result)
|
||||
}
|
||||
Encoding::Gb2312 => {
|
||||
let result = encoding_rs::GBK.decode(data);
|
||||
if result.2 {
|
||||
if check {
|
||||
return Err(anyhow::anyhow!("Failed to decode GB2312"));
|
||||
}
|
||||
let result = encoding::codec::simpchinese::GBK_ENCODING
|
||||
.decode(
|
||||
data,
|
||||
if check {
|
||||
DecoderTrap::Strict
|
||||
} else {
|
||||
DecoderTrap::Replace
|
||||
},
|
||||
)
|
||||
.map_err(|_| anyhow::anyhow!("Failed to decode GB2312"))?;
|
||||
if result.contains('\u{FFFD}') {
|
||||
eprintln!(
|
||||
"Warning: Some characters could not be decoded in GB2312: {:?}",
|
||||
data
|
||||
);
|
||||
crate::COUNTER.inc_warning();
|
||||
}
|
||||
Ok(result.0.to_string())
|
||||
Ok(result)
|
||||
}
|
||||
#[cfg(windows)]
|
||||
Encoding::CodePage(code_page) => Ok(super::encoding_win::decode_to_string(
|
||||
@@ -107,6 +143,26 @@ pub fn decode_to_string(
|
||||
}
|
||||
}
|
||||
|
||||
thread_local! {
|
||||
static ENCODE_REPLACED: AtomicBool = AtomicBool::new(false);
|
||||
}
|
||||
|
||||
fn jis_encoder_trap(_: &mut dyn RawEncoder, data: &str, out: &mut dyn ByteWriter) -> bool {
|
||||
if data == "\u{f8f3}" {
|
||||
out.write_byte(0xFF); // PUA character for U+F8F3
|
||||
} else {
|
||||
out.write_byte(b'?'); // Replacement character
|
||||
ENCODE_REPLACED.with(|f| f.qsave(true));
|
||||
}
|
||||
true
|
||||
}
|
||||
|
||||
fn gbk_encoder_trap(_: &mut dyn RawEncoder, _: &str, out: &mut dyn ByteWriter) -> bool {
|
||||
out.write_byte(b'?'); // Replacement character
|
||||
ENCODE_REPLACED.with(|f| f.qsave(true));
|
||||
true
|
||||
}
|
||||
|
||||
/// Encodes a string to a byte vector using the specified encoding.
|
||||
///
|
||||
/// * `check` - If true, checks for encoding errors and returns an error if any.
|
||||
@@ -119,32 +175,50 @@ pub fn encode_string(
|
||||
Encoding::Auto => Ok(data.as_bytes().to_vec()),
|
||||
Encoding::Utf8 => Ok(data.as_bytes().to_vec()),
|
||||
Encoding::Cp932 => {
|
||||
let result = encoding_rs::SHIFT_JIS.encode(data);
|
||||
if result.2 {
|
||||
if check {
|
||||
return Err(anyhow::anyhow!("Failed to encode Shift-JIS"));
|
||||
ENCODE_REPLACED.with(|f| f.qsave(false));
|
||||
let result = encoding::codec::japanese::Windows31JEncoding
|
||||
.encode(
|
||||
data,
|
||||
if check {
|
||||
EncoderTrap::Strict
|
||||
} else {
|
||||
EncoderTrap::Call(jis_encoder_trap)
|
||||
},
|
||||
)
|
||||
.map_err(|_| anyhow::anyhow!("Failed to encode Shift-JIS"))?;
|
||||
ENCODE_REPLACED.with(|f| {
|
||||
if f.qload() {
|
||||
eprintln!(
|
||||
"Warning: Some characters could not be encoded in Shift-JIS: {}",
|
||||
data
|
||||
);
|
||||
crate::COUNTER.inc_warning();
|
||||
}
|
||||
eprintln!(
|
||||
"Warning: Some characters could not be encoded in Shift-JIS: {}",
|
||||
data
|
||||
);
|
||||
crate::COUNTER.inc_warning();
|
||||
}
|
||||
Ok(result.0.to_vec())
|
||||
});
|
||||
Ok(result)
|
||||
}
|
||||
Encoding::Gb2312 => {
|
||||
let result = encoding_rs::GBK.encode(data);
|
||||
if result.2 {
|
||||
if check {
|
||||
return Err(anyhow::anyhow!("Failed to encode GB2312"));
|
||||
ENCODE_REPLACED.with(|f| f.qsave(false));
|
||||
let result = encoding::codec::simpchinese::GBK_ENCODING
|
||||
.encode(
|
||||
data,
|
||||
if check {
|
||||
EncoderTrap::Strict
|
||||
} else {
|
||||
EncoderTrap::Call(gbk_encoder_trap)
|
||||
},
|
||||
)
|
||||
.map_err(|_| anyhow::anyhow!("Failed to encode GB2312"))?;
|
||||
ENCODE_REPLACED.with(|f| {
|
||||
if f.qload() {
|
||||
eprintln!(
|
||||
"Warning: Some characters could not be encoded in GB2312: {}",
|
||||
data
|
||||
);
|
||||
crate::COUNTER.inc_warning();
|
||||
}
|
||||
eprintln!(
|
||||
"Warning: Some characters could not be encoded in GB2312: {}",
|
||||
data
|
||||
);
|
||||
crate::COUNTER.inc_warning();
|
||||
}
|
||||
Ok(result.0.to_vec())
|
||||
});
|
||||
Ok(result)
|
||||
}
|
||||
#[cfg(windows)]
|
||||
Encoding::CodePage(code_page) => {
|
||||
@@ -315,3 +389,17 @@ fn test_encode_string_with_bom() {
|
||||
vec![0xE4, 0xB8, 0xAD, 0xE6, 0x96, 0x87]
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn shift_jis_pua_test() {
|
||||
let ff = [0xFF, 0x01];
|
||||
#[cfg(windows)]
|
||||
assert_eq!(
|
||||
decode_to_string(Encoding::CodePage(932), &ff, false).unwrap(),
|
||||
"\u{f8f3}\x01".to_string()
|
||||
);
|
||||
assert_eq!(
|
||||
decode_to_string(Encoding::Cp932, &ff, false).unwrap(),
|
||||
"\u{f8f3}\x01".to_string()
|
||||
);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user