mirror of
https://github.com/lifegpc/msg-tool.git
synced 2026-06-06 12:58:45 +08:00
Add BOM detect support
This commit is contained in:
10
Cargo.lock
generated
10
Cargo.lock
generated
@@ -428,6 +428,7 @@ dependencies = [
|
||||
"serde",
|
||||
"serde_json",
|
||||
"unicode-segmentation",
|
||||
"utf16string",
|
||||
"windows-sys",
|
||||
]
|
||||
|
||||
@@ -622,6 +623,15 @@ version = "1.12.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f6ccf251212114b54433ec949fd6a7841275f9ada20dddd2f29e9ceea4501493"
|
||||
|
||||
[[package]]
|
||||
name = "utf16string"
|
||||
version = "0.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0b62a1e85e12d5d712bf47a85f426b73d303e2d00a90de5f3004df3596e9d216"
|
||||
dependencies = [
|
||||
"byteorder",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "utf8parse"
|
||||
version = "0.2.2"
|
||||
|
||||
@@ -20,6 +20,7 @@ rand = { version = "0.9", optional = true }
|
||||
serde = { version = "1", features = ["derive"] }
|
||||
serde_json = "1"
|
||||
unicode-segmentation = "1.12"
|
||||
utf16string = "0.2"
|
||||
|
||||
[features]
|
||||
default = ["bgi", "bgi-arc", "bgi-img", "cat-system", "cat-system-arc", "cat-system-img", "circus", "escude", "escude-arc", "kirikiri", "yaneurao", "yaneurao-itufuru"]
|
||||
|
||||
19
src/types.rs
19
src/types.rs
@@ -396,3 +396,22 @@ pub struct ImageDataWithName {
|
||||
pub name: String,
|
||||
pub data: ImageData,
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy, Debug, ValueEnum, PartialEq, Eq, PartialOrd, Ord)]
|
||||
pub enum BomType {
|
||||
None,
|
||||
Utf8,
|
||||
Utf16LE,
|
||||
Utf16BE,
|
||||
}
|
||||
|
||||
impl BomType {
|
||||
pub fn as_bytes(&self) -> &'static [u8] {
|
||||
match self {
|
||||
BomType::None => &[],
|
||||
BomType::Utf8 => b"\xEF\xBB\xBF",
|
||||
BomType::Utf16LE => b"\xFF\xFE",
|
||||
BomType::Utf16BE => b"\xFE\xFF",
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,5 +1,51 @@
|
||||
use crate::types::*;
|
||||
|
||||
pub fn decode_with_bom_detect(
|
||||
encoding: Encoding,
|
||||
data: &[u8],
|
||||
) -> Result<(String, BomType), anyhow::Error> {
|
||||
if data.len() >= 2 {
|
||||
if data[0] == 0xFE && data[1] == 0xFF {
|
||||
let result = encoding_rs::UTF_16BE.decode(&data[2..]);
|
||||
if result.2 {
|
||||
return Err(anyhow::anyhow!("Failed to decode UTF-16BE"));
|
||||
} else {
|
||||
return Ok((result.0.into_owned(), BomType::Utf16BE));
|
||||
}
|
||||
} else if data[0] == 0xFF && data[1] == 0xFE {
|
||||
let result = encoding_rs::UTF_16LE.decode(&data[2..]);
|
||||
if result.2 {
|
||||
return Err(anyhow::anyhow!("Failed to decode UTF-16LE"));
|
||||
} else {
|
||||
return Ok((result.0.into_owned(), BomType::Utf16LE));
|
||||
}
|
||||
}
|
||||
}
|
||||
if data.len() >= 3 {
|
||||
if data[0] == 0xEF && data[1] == 0xBB && data[2] == 0xBF {
|
||||
return Ok((String::from_utf8(data[3..].to_vec())?, BomType::Utf8));
|
||||
}
|
||||
}
|
||||
#[cfg(feature = "kirikiri")]
|
||||
{
|
||||
use crate::ext::io::*;
|
||||
use crate::scripts::kirikiri::simple_crypt::SimpleCrypt;
|
||||
if data.len() >= 5
|
||||
&& data[0] == 0xFE
|
||||
&& data[1] == 0xFE
|
||||
&& (data[2] == 0 || data[2] == 1 || data[2] == 2)
|
||||
&& data[3] == 0xFF
|
||||
&& data[4] == 0xFE
|
||||
{
|
||||
let crypt = data[2];
|
||||
let reader = MemReaderRef::new(data);
|
||||
let decoded = SimpleCrypt::unpack(crypt, reader)?;
|
||||
return decode_with_bom_detect(encoding, &decoded);
|
||||
}
|
||||
}
|
||||
decode_to_string(encoding, data).map(|s| (s, BomType::None))
|
||||
}
|
||||
|
||||
pub fn decode_to_string(encoding: Encoding, data: &[u8]) -> Result<String, anyhow::Error> {
|
||||
match encoding {
|
||||
Encoding::Auto => decode_to_string(Encoding::Utf8, data)
|
||||
@@ -72,6 +118,34 @@ pub fn encode_string(
|
||||
}
|
||||
}
|
||||
|
||||
pub fn encode_string_with_bom(
|
||||
encoding: Encoding,
|
||||
data: &str,
|
||||
check: bool,
|
||||
bom: BomType,
|
||||
) -> Result<Vec<u8>, anyhow::Error> {
|
||||
match bom {
|
||||
BomType::None => encode_string(encoding, data, check),
|
||||
BomType::Utf8 => {
|
||||
let mut result = vec![0xEF, 0xBB, 0xBF];
|
||||
result.extend_from_slice(data.as_bytes());
|
||||
Ok(result)
|
||||
}
|
||||
BomType::Utf16LE => {
|
||||
let mut result = vec![0xFF, 0xFE];
|
||||
let re = utf16string::WString::<utf16string::LE>::from(data);
|
||||
result.extend(re.as_bytes());
|
||||
Ok(result)
|
||||
}
|
||||
BomType::Utf16BE => {
|
||||
let mut result = vec![0xFE, 0xFF];
|
||||
let re = utf16string::WString::<utf16string::BE>::from(data);
|
||||
result.extend(re.as_bytes());
|
||||
Ok(result)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_decode_to_string() {
|
||||
assert_eq!(
|
||||
@@ -143,3 +217,56 @@ fn test_encode_string() {
|
||||
vec![214, 208, 206, 196]
|
||||
);
|
||||
}
|
||||
|
||||
/// Checks that each supported byte-order mark is detected, stripped, and
/// reported, and that unmarked input falls back to the requested encoding.
#[test]
fn test_decode_with_bom_detect() {
    // (input bytes, mark expected to be reported); every input spells "中文".
    let cases: [(&[u8], BomType); 4] = [
        (
            &[0xEF, 0xBB, 0xBF, 0xE4, 0xB8, 0xAD, 0xE6, 0x96, 0x87],
            BomType::Utf8,
        ),
        (&[0xFF, 0xFE, 0x2D, 0x4E, 0x87, 0x65], BomType::Utf16LE),
        (&[0xFE, 0xFF, 0x4E, 0x2D, 0x65, 0x87], BomType::Utf16BE),
        (&[0xE4, 0xB8, 0xAD, 0xE6, 0x96, 0x87], BomType::None),
    ];
    for &(bytes, expected_bom) in &cases {
        let (text, bom_type) = decode_with_bom_detect(Encoding::Auto, bytes).unwrap();
        assert_eq!(text, "中文", "wrong text for {expected_bom:?} input");
        assert_eq!(bom_type, expected_bom, "wrong mark for {expected_bom:?} input");
    }
    #[cfg(feature = "kirikiri")]
    {
        let simple_crypt_data: &[u8] = &[
            0xFE, 0xFE, 0x01, 0xFF, 0xFE, // Header
            0x11, 0x00, 0x34, 0x00, 0x36, 0x00, 0x3a, 0x00, 0x11, 0x00, 0x0e, 0x00, 0x05, 0x00,
        ];
        let (decoded_simple_crypt, bom_type) =
            decode_with_bom_detect(Encoding::Auto, simple_crypt_data).unwrap();
        assert_eq!(decoded_simple_crypt, "\"895\"\r\n");
        assert_eq!(bom_type, BomType::Utf16LE);
    }
}
|
||||
|
||||
/// Checks the exact bytes produced for "中文" with every `BomType`.
#[test]
fn test_encode_string_with_bom() {
    // (requested mark, expected output bytes).
    let cases: [(BomType, &[u8]); 4] = [
        (
            BomType::Utf8,
            &[0xEF, 0xBB, 0xBF, 0xE4, 0xB8, 0xAD, 0xE6, 0x96, 0x87],
        ),
        (BomType::Utf16LE, &[0xFF, 0xFE, 0x2D, 0x4E, 0x87, 0x65]),
        (BomType::Utf16BE, &[0xFE, 0xFF, 0x4E, 0x2D, 0x65, 0x87]),
        (BomType::None, &[0xE4, 0xB8, 0xAD, 0xE6, 0x96, 0x87]),
    ];
    for &(bom, expected) in &cases {
        let encoded = encode_string_with_bom(Encoding::Utf8, "中文", true, bom).unwrap();
        assert_eq!(encoded, expected, "wrong bytes for {bom:?}");
    }
}
|
||||
|
||||
Reference in New Issue
Block a user