mirror of
https://github.com/lifegpc/msg-tool.git
synced 2026-06-08 13:58:50 +08:00
Add support for PUA in BGI strings
This commit is contained in:
@@ -3,6 +3,7 @@ use crate::types::*;
|
||||
pub fn decode_with_bom_detect(
|
||||
encoding: Encoding,
|
||||
data: &[u8],
|
||||
check: bool,
|
||||
) -> Result<(String, BomType), anyhow::Error> {
|
||||
if data.len() >= 2 {
|
||||
if data[0] == 0xFE && data[1] == 0xFF {
|
||||
@@ -34,7 +35,7 @@ pub fn decode_with_bom_detect(
|
||||
if data.len() >= 8 && data.starts_with(b"mdf\0") {
|
||||
let reader = MemReaderRef::new(&data[4..]);
|
||||
let decoded = Mdf::unpack(reader)?;
|
||||
return decode_with_bom_detect(encoding, &decoded);
|
||||
return decode_with_bom_detect(encoding, &decoded, check);
|
||||
}
|
||||
if data.len() >= 5
|
||||
&& data[0] == 0xFE
|
||||
@@ -46,38 +47,54 @@ pub fn decode_with_bom_detect(
|
||||
let crypt = data[2];
|
||||
let reader = MemReaderRef::new(data);
|
||||
let decoded = SimpleCrypt::unpack(crypt, reader)?;
|
||||
return decode_with_bom_detect(encoding, &decoded);
|
||||
return decode_with_bom_detect(encoding, &decoded, check);
|
||||
}
|
||||
}
|
||||
decode_to_string(encoding, data).map(|s| (s, BomType::None))
|
||||
decode_to_string(encoding, data, check).map(|s| (s, BomType::None))
|
||||
}
|
||||
|
||||
pub fn decode_to_string(encoding: Encoding, data: &[u8]) -> Result<String, anyhow::Error> {
|
||||
pub fn decode_to_string(
|
||||
encoding: Encoding,
|
||||
data: &[u8],
|
||||
check: bool,
|
||||
) -> Result<String, anyhow::Error> {
|
||||
match encoding {
|
||||
Encoding::Auto => decode_to_string(Encoding::Utf8, data)
|
||||
.or_else(|_| decode_to_string(Encoding::Cp932, data))
|
||||
.or_else(|_| decode_to_string(Encoding::Gb2312, data)),
|
||||
Encoding::Auto => decode_to_string(Encoding::Utf8, data, check)
|
||||
.or_else(|_| decode_to_string(Encoding::Cp932, data, check))
|
||||
.or_else(|_| decode_to_string(Encoding::Gb2312, data, check)),
|
||||
Encoding::Utf8 => Ok(String::from_utf8(data.to_vec())?),
|
||||
Encoding::Cp932 => {
|
||||
let result = encoding_rs::SHIFT_JIS.decode(data);
|
||||
if result.2 {
|
||||
Err(anyhow::anyhow!("Failed to decode Shift-JIS"))
|
||||
} else {
|
||||
Ok(result.0.to_string())
|
||||
if check {
|
||||
return Err(anyhow::anyhow!("Failed to decode Shift-JIS"));
|
||||
}
|
||||
eprintln!(
|
||||
"Warning: Some characters could not be decoded in Shift-JIS: {:?}",
|
||||
data
|
||||
);
|
||||
crate::COUNTER.inc_warning();
|
||||
}
|
||||
Ok(result.0.to_string())
|
||||
}
|
||||
Encoding::Gb2312 => {
|
||||
let result = encoding_rs::GBK.decode(data);
|
||||
if result.2 {
|
||||
Err(anyhow::anyhow!("Failed to decode GB2312"))
|
||||
} else {
|
||||
Ok(result.0.to_string())
|
||||
if check {
|
||||
return Err(anyhow::anyhow!("Failed to decode GB2312"));
|
||||
}
|
||||
eprintln!(
|
||||
"Warning: Some characters could not be decoded in GB2312: {:?}",
|
||||
data
|
||||
);
|
||||
crate::COUNTER.inc_warning();
|
||||
}
|
||||
Ok(result.0.to_string())
|
||||
}
|
||||
#[cfg(windows)]
|
||||
Encoding::CodePage(code_page) => {
|
||||
Ok(super::encoding_win::decode_to_string(code_page, data)?)
|
||||
}
|
||||
Encoding::CodePage(code_page) => Ok(super::encoding_win::decode_to_string(
|
||||
code_page, data, check,
|
||||
)?),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -157,7 +174,8 @@ fn test_decode_to_string() {
|
||||
assert_eq!(
|
||||
decode_to_string(
|
||||
Encoding::Utf8,
|
||||
&[228, 184, 173, 230, 150, 135, 230, 181, 139, 232, 175, 149]
|
||||
&[228, 184, 173, 230, 150, 135, 230, 181, 139, 232, 175, 149],
|
||||
true
|
||||
)
|
||||
.unwrap(),
|
||||
"中文测试".to_string()
|
||||
@@ -167,19 +185,21 @@ fn test_decode_to_string() {
|
||||
Encoding::Cp932,
|
||||
&[
|
||||
130, 171, 130, 225, 130, 215, 130, 194, 130, 187, 130, 211, 130, 198
|
||||
]
|
||||
],
|
||||
true
|
||||
)
|
||||
.unwrap(),
|
||||
"きゃべつそふと".to_string()
|
||||
);
|
||||
assert_eq!(
|
||||
decode_to_string(Encoding::Gb2312, &[214, 208, 206, 196]).unwrap(),
|
||||
decode_to_string(Encoding::Gb2312, &[214, 208, 206, 196], true).unwrap(),
|
||||
"中文".to_string()
|
||||
);
|
||||
assert_eq!(
|
||||
decode_to_string(
|
||||
Encoding::Auto,
|
||||
&[228, 184, 173, 230, 150, 135, 230, 181, 139, 232, 175, 149]
|
||||
&[228, 184, 173, 230, 150, 135, 230, 181, 139, 232, 175, 149],
|
||||
true
|
||||
)
|
||||
.unwrap(),
|
||||
"中文测试".to_string()
|
||||
@@ -189,14 +209,15 @@ fn test_decode_to_string() {
|
||||
Encoding::Auto,
|
||||
&[
|
||||
130, 171, 130, 225, 130, 215, 130, 194, 130, 187, 130, 211, 130, 198
|
||||
]
|
||||
],
|
||||
true
|
||||
)
|
||||
.unwrap(),
|
||||
"きゃべつそふと".to_string()
|
||||
);
|
||||
#[cfg(windows)]
|
||||
assert_eq!(
|
||||
decode_to_string(Encoding::CodePage(936), &[214, 208, 206, 196]).unwrap(),
|
||||
decode_to_string(Encoding::CodePage(936), &[214, 208, 206, 196], true).unwrap(),
|
||||
"中文".to_string()
|
||||
);
|
||||
}
|
||||
@@ -227,21 +248,23 @@ fn test_encode_string() {
|
||||
#[test]
|
||||
fn test_decode_with_bom_detect() {
|
||||
let utf8_data = vec![0xEF, 0xBB, 0xBF, 0xE4, 0xB8, 0xAD, 0xE6, 0x96, 0x87];
|
||||
let (decoded_utf8, bom_type) = decode_with_bom_detect(Encoding::Auto, &utf8_data).unwrap();
|
||||
let (decoded_utf8, bom_type) =
|
||||
decode_with_bom_detect(Encoding::Auto, &utf8_data, true).unwrap();
|
||||
assert_eq!(decoded_utf8, "中文");
|
||||
assert_eq!(bom_type, BomType::Utf8);
|
||||
let utf16le_data = vec![0xFF, 0xFE, 0x2D, 0x4E, 0x87, 0x65];
|
||||
let (decoded_utf16le, bom_type) =
|
||||
decode_with_bom_detect(Encoding::Auto, &utf16le_data).unwrap();
|
||||
decode_with_bom_detect(Encoding::Auto, &utf16le_data, true).unwrap();
|
||||
assert_eq!(decoded_utf16le, "中文");
|
||||
assert_eq!(bom_type, BomType::Utf16LE);
|
||||
let utf16be_data = vec![0xFE, 0xFF, 0x4E, 0x2D, 0x65, 0x87];
|
||||
let (decoded_utf16be, bom_type) =
|
||||
decode_with_bom_detect(Encoding::Auto, &utf16be_data).unwrap();
|
||||
decode_with_bom_detect(Encoding::Auto, &utf16be_data, true).unwrap();
|
||||
assert_eq!(decoded_utf16be, "中文");
|
||||
assert_eq!(bom_type, BomType::Utf16BE);
|
||||
let no_bom_data = vec![0xE4, 0xB8, 0xAD, 0xE6, 0x96, 0x87];
|
||||
let (decoded_no_bom, bom_type) = decode_with_bom_detect(Encoding::Auto, &no_bom_data).unwrap();
|
||||
let (decoded_no_bom, bom_type) =
|
||||
decode_with_bom_detect(Encoding::Auto, &no_bom_data, true).unwrap();
|
||||
assert_eq!(decoded_no_bom, "中文");
|
||||
assert_eq!(bom_type, BomType::None);
|
||||
#[cfg(feature = "kirikiri")]
|
||||
@@ -251,7 +274,7 @@ fn test_decode_with_bom_detect() {
|
||||
0x11, 0x00, 0x34, 0x00, 0x36, 0x00, 0x3a, 0x00, 0x11, 0x00, 0x0e, 0x00, 0x05, 0x00,
|
||||
];
|
||||
let (decoded_simple_crypt, bom_type) =
|
||||
decode_with_bom_detect(Encoding::Auto, &simple_crypt_data).unwrap();
|
||||
decode_with_bom_detect(Encoding::Auto, &simple_crypt_data, true).unwrap();
|
||||
assert_eq!(decoded_simple_crypt, "\"895\"\r\n");
|
||||
assert_eq!(bom_type, BomType::Utf16LE);
|
||||
}
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
use windows_sys::Win32::Foundation::GetLastError;
|
||||
use windows_sys::Win32::Foundation::{ERROR_NO_UNICODE_TRANSLATION, GetLastError};
|
||||
use windows_sys::Win32::Globalization::{
|
||||
CP_UTF7, CP_UTF8, MB_ERR_INVALID_CHARS, MultiByteToWideChar, WideCharToMultiByte,
|
||||
};
|
||||
@@ -47,14 +47,15 @@ impl std::fmt::Display for WinError {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn decode_to_string(cp: u32, data: &[u8]) -> Result<String, WinError> {
|
||||
pub fn decode_to_string(cp: u32, data: &[u8], check: bool) -> Result<String, WinError> {
|
||||
if data.is_empty() {
|
||||
return Ok(String::new());
|
||||
}
|
||||
let dwflags = if check { MB_ERR_INVALID_CHARS } else { 0 };
|
||||
let needed_len = unsafe {
|
||||
MultiByteToWideChar(
|
||||
cp,
|
||||
MB_ERR_INVALID_CHARS,
|
||||
dwflags,
|
||||
data.as_ptr() as _,
|
||||
data.len() as i32,
|
||||
std::ptr::null_mut(),
|
||||
@@ -64,12 +65,24 @@ pub fn decode_to_string(cp: u32, data: &[u8]) -> Result<String, WinError> {
|
||||
if needed_len == 0 {
|
||||
return Err(WinError::from_last_error());
|
||||
}
|
||||
let last_error = unsafe { GetLastError() };
|
||||
if last_error == ERROR_NO_UNICODE_TRANSLATION {
|
||||
if check {
|
||||
return Err(WinError::new(last_error));
|
||||
} else {
|
||||
eprintln!(
|
||||
"Warning: Some characters could not be decoded in code page {}: {:?}",
|
||||
cp, data
|
||||
);
|
||||
crate::COUNTER.inc_warning();
|
||||
}
|
||||
}
|
||||
let mut wc = Vec::with_capacity(needed_len as usize);
|
||||
wc.resize(needed_len as usize, 0);
|
||||
let result = unsafe {
|
||||
MultiByteToWideChar(
|
||||
cp,
|
||||
MB_ERR_INVALID_CHARS,
|
||||
dwflags,
|
||||
data.as_ptr() as _,
|
||||
data.len() as i32,
|
||||
wc.as_mut_ptr(),
|
||||
@@ -143,7 +156,8 @@ fn test_decode_to_string() {
|
||||
assert_eq!(
|
||||
decode_to_string(
|
||||
65001,
|
||||
&[228, 184, 173, 230, 150, 135, 230, 181, 139, 232, 175, 149]
|
||||
&[228, 184, 173, 230, 150, 135, 230, 181, 139, 232, 175, 149],
|
||||
true
|
||||
)
|
||||
.unwrap(),
|
||||
"中文测试".to_string()
|
||||
@@ -153,13 +167,14 @@ fn test_decode_to_string() {
|
||||
932,
|
||||
&[
|
||||
130, 171, 130, 225, 130, 215, 130, 194, 130, 187, 130, 211, 130, 198
|
||||
]
|
||||
],
|
||||
true
|
||||
)
|
||||
.unwrap(),
|
||||
"きゃべつそふと".to_string()
|
||||
);
|
||||
assert_eq!(
|
||||
decode_to_string(936, &[214, 208, 206, 196]).unwrap(),
|
||||
decode_to_string(936, &[214, 208, 206, 196], true).unwrap(),
|
||||
"中文".to_string()
|
||||
);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user