From 7810e7a569239c165f50b3729a7ddbef13b8b841 Mon Sep 17 00:00:00 2001
From: lifegpc
Date: Tue, 1 Jul 2025 23:01:39 +0800
Subject: [PATCH] Add BOM detect support

---
 Cargo.lock            |  10 +++
 Cargo.toml            |   1 +
 src/types.rs          |  21 ++++++
 src/utils/encoding.rs | 148 ++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 180 insertions(+)

diff --git a/Cargo.lock b/Cargo.lock
index 9ffab84..398d582 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -428,6 +428,7 @@ dependencies = [
  "serde",
  "serde_json",
  "unicode-segmentation",
+ "utf16string",
  "windows-sys",
 ]
 
@@ -622,6 +623,15 @@ version = "1.12.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "f6ccf251212114b54433ec949fd6a7841275f9ada20dddd2f29e9ceea4501493"
 
+[[package]]
+name = "utf16string"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0b62a1e85e12d5d712bf47a85f426b73d303e2d00a90de5f3004df3596e9d216"
+dependencies = [
+ "byteorder",
+]
+
 [[package]]
 name = "utf8parse"
 version = "0.2.2"
diff --git a/Cargo.toml b/Cargo.toml
index fc21d32..3cb6ba8 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -20,6 +20,7 @@ rand = { version = "0.9", optional = true }
 serde = { version = "1", features = ["derive"] }
 serde_json = "1"
 unicode-segmentation = "1.12"
+utf16string = "0.2"
 
 [features]
 default = ["bgi", "bgi-arc", "bgi-img", "cat-system", "cat-system-arc", "cat-system-img", "circus", "escude", "escude-arc", "kirikiri", "yaneurao", "yaneurao-itufuru"]
diff --git a/src/types.rs b/src/types.rs
index 019c3c1..67c20b3 100644
--- a/src/types.rs
+++ b/src/types.rs
@@ -396,3 +396,24 @@ pub struct ImageDataWithName {
     pub name: String,
     pub data: ImageData,
 }
+
+/// Byte order mark (BOM) that may prefix an encoded text buffer.
+#[derive(Clone, Copy, Debug, ValueEnum, PartialEq, Eq, PartialOrd, Ord)]
+pub enum BomType {
+    None,
+    Utf8,
+    Utf16LE,
+    Utf16BE,
+}
+
+impl BomType {
+    /// Returns the raw bytes of this byte order mark; empty for `BomType::None`.
+    pub fn as_bytes(&self) -> &'static [u8] {
+        match self {
+            BomType::None => &[],
+            BomType::Utf8 => b"\xEF\xBB\xBF",
+            BomType::Utf16LE => b"\xFF\xFE",
+            BomType::Utf16BE => b"\xFE\xFF",
+        }
+    }
+}
diff --git a/src/utils/encoding.rs b/src/utils/encoding.rs
index bd81f64..bbdafde 100644
--- a/src/utils/encoding.rs
+++ b/src/utils/encoding.rs
@@ -1,5 +1,61 @@
 use crate::types::*;
 
+/// Decodes `data` into a string, honoring a leading byte order mark (BOM).
+///
+/// A UTF-16BE (`FE FF`), UTF-16LE (`FF FE`) or UTF-8 (`EF BB BF`) BOM selects
+/// the decoder and is removed from the result. With the `kirikiri` feature,
+/// data that starts with a SimpleCrypt header (`FE FE <0|1|2> FF FE`) is
+/// unpacked first and the unpacked bytes are decoded by the same rules. Data
+/// without a BOM is decoded with `encoding` through [`decode_to_string`].
+///
+/// Returns the decoded text together with the BOM that was detected.
+///
+/// # Errors
+///
+/// Fails when the payload is malformed for the detected encoding, when
+/// SimpleCrypt unpacking fails, or when [`decode_to_string`] fails.
+pub fn decode_with_bom_detect(
+    encoding: Encoding,
+    data: &[u8],
+) -> Result<(String, BomType), anyhow::Error> {
+    if let Some(rest) = data.strip_prefix(BomType::Utf16BE.as_bytes()) {
+        // The BOM is already stripped, so the decoder must not sniff for one again.
+        let (text, had_errors) = encoding_rs::UTF_16BE.decode_without_bom_handling(rest);
+        if had_errors {
+            return Err(anyhow::anyhow!("Failed to decode UTF-16BE"));
+        }
+        return Ok((text.into_owned(), BomType::Utf16BE));
+    }
+    if let Some(rest) = data.strip_prefix(BomType::Utf16LE.as_bytes()) {
+        let (text, had_errors) = encoding_rs::UTF_16LE.decode_without_bom_handling(rest);
+        if had_errors {
+            return Err(anyhow::anyhow!("Failed to decode UTF-16LE"));
+        }
+        return Ok((text.into_owned(), BomType::Utf16LE));
+    }
+    if let Some(rest) = data.strip_prefix(BomType::Utf8.as_bytes()) {
+        return Ok((String::from_utf8(rest.to_vec())?, BomType::Utf8));
+    }
+    #[cfg(feature = "kirikiri")]
+    {
+        use crate::ext::io::*;
+        use crate::scripts::kirikiri::simple_crypt::SimpleCrypt;
+        if data.len() >= 5
+            && data[0] == 0xFE
+            && data[1] == 0xFE
+            && (data[2] == 0 || data[2] == 1 || data[2] == 2)
+            && data[3] == 0xFF
+            && data[4] == 0xFE
+        {
+            let crypt = data[2];
+            let reader = MemReaderRef::new(data);
+            let decoded = SimpleCrypt::unpack(crypt, reader)?;
+            return decode_with_bom_detect(encoding, &decoded);
+        }
+    }
+    decode_to_string(encoding, data).map(|s| (s, BomType::None))
+}
+
 pub fn decode_to_string(encoding: Encoding, data: &[u8]) -> Result<String, anyhow::Error> {
     match encoding {
         Encoding::Auto => decode_to_string(Encoding::Utf8, data)
@@ -72,6 +128,39 @@ pub fn encode_string(
     }
 }
 
+/// Encodes `data` into bytes that start with the byte order mark `bom`.
+///
+/// With [`BomType::None`] the text is encoded by [`encode_string`] using
+/// `encoding` and `check`. Any other BOM dictates the output encoding itself
+/// (UTF-8 or UTF-16), so `encoding` and `check` are ignored in that case.
+pub fn encode_string_with_bom(
+    encoding: Encoding,
+    data: &str,
+    check: bool,
+    bom: BomType,
+) -> Result<Vec<u8>, anyhow::Error> {
+    match bom {
+        BomType::None => encode_string(encoding, data, check),
+        BomType::Utf8 => {
+            let mut result = vec![0xEF, 0xBB, 0xBF];
+            result.extend_from_slice(data.as_bytes());
+            Ok(result)
+        }
+        BomType::Utf16LE => {
+            let mut result = vec![0xFF, 0xFE];
+            let re = utf16string::WString::<utf16string::LE>::from(data);
+            result.extend_from_slice(re.as_bytes());
+            Ok(result)
+        }
+        BomType::Utf16BE => {
+            let mut result = vec![0xFE, 0xFF];
+            let re = utf16string::WString::<utf16string::BE>::from(data);
+            result.extend_from_slice(re.as_bytes());
+            Ok(result)
+        }
+    }
+}
+
 #[test]
 fn test_decode_to_string() {
     assert_eq!(
@@ -143,3 +232,62 @@ fn test_encode_string() {
         vec![214, 208, 206, 196]
     );
 }
+
+#[test]
+fn test_decode_with_bom_detect() {
+    let utf8_data = vec![0xEF, 0xBB, 0xBF, 0xE4, 0xB8, 0xAD, 0xE6, 0x96, 0x87];
+    let (decoded_utf8, bom_type) = decode_with_bom_detect(Encoding::Auto, &utf8_data).unwrap();
+    assert_eq!(decoded_utf8, "中文");
+    assert_eq!(bom_type, BomType::Utf8);
+    let utf16le_data = vec![0xFF, 0xFE, 0x2D, 0x4E, 0x87, 0x65];
+    let (decoded_utf16le, bom_type) =
+        decode_with_bom_detect(Encoding::Auto, &utf16le_data).unwrap();
+    assert_eq!(decoded_utf16le, "中文");
+    assert_eq!(bom_type, BomType::Utf16LE);
+    let utf16be_data = vec![0xFE, 0xFF, 0x4E, 0x2D, 0x65, 0x87];
+    let (decoded_utf16be, bom_type) =
+        decode_with_bom_detect(Encoding::Auto, &utf16be_data).unwrap();
+    assert_eq!(decoded_utf16be, "中文");
+    assert_eq!(bom_type, BomType::Utf16BE);
+    let no_bom_data = vec![0xE4, 0xB8, 0xAD, 0xE6, 0x96, 0x87];
+    let (decoded_no_bom, bom_type) = decode_with_bom_detect(Encoding::Auto, &no_bom_data).unwrap();
+    assert_eq!(decoded_no_bom, "中文");
+    assert_eq!(bom_type, BomType::None);
+    // A U+FEFF that follows the BOM is payload and must survive decoding.
+    let double_bom_data = vec![0xFF, 0xFE, 0xFF, 0xFE, 0x2D, 0x4E];
+    let (decoded_double_bom, bom_type) =
+        decode_with_bom_detect(Encoding::Auto, &double_bom_data).unwrap();
+    assert_eq!(decoded_double_bom, "\u{FEFF}中");
+    assert_eq!(bom_type, BomType::Utf16LE);
+    #[cfg(feature = "kirikiri")]
+    {
+        let simple_crypt_data = vec![
+            0xFE, 0xFE, 0x01, 0xFF, 0xFE, // Header
+            0x11, 0x00, 0x34, 0x00, 0x36, 0x00, 0x3a, 0x00, 0x11, 0x00, 0x0e, 0x00, 0x05, 0x00,
+        ];
+        let (decoded_simple_crypt, bom_type) =
+            decode_with_bom_detect(Encoding::Auto, &simple_crypt_data).unwrap();
+        assert_eq!(decoded_simple_crypt, "\"895\"\r\n");
+        assert_eq!(bom_type, BomType::Utf16LE);
+    }
+}
+
+#[test]
+fn test_encode_string_with_bom() {
+    assert_eq!(
+        encode_string_with_bom(Encoding::Utf8, "中文", true, BomType::Utf8).unwrap(),
+        vec![0xEF, 0xBB, 0xBF, 0xE4, 0xB8, 0xAD, 0xE6, 0x96, 0x87]
+    );
+    assert_eq!(
+        encode_string_with_bom(Encoding::Utf8, "中文", true, BomType::Utf16LE).unwrap(),
+        vec![0xFF, 0xFE, 0x2D, 0x4E, 0x87, 0x65]
+    );
+    assert_eq!(
+        encode_string_with_bom(Encoding::Utf8, "中文", true, BomType::Utf16BE).unwrap(),
+        vec![0xFE, 0xFF, 0x4E, 0x2D, 0x65, 0x87]
+    );
+    assert_eq!(
+        encode_string_with_bom(Encoding::Utf8, "中文", true, BomType::None).unwrap(),
+        vec![0xE4, 0xB8, 0xAD, 0xE6, 0x96, 0x87]
+    );
+}