Add more PUA support for SHIFTJIS

This commit is contained in:
2025-08-13 11:37:15 +08:00
parent 270a2d9fdf
commit 1f34d0458b
2 changed files with 68 additions and 5 deletions

View File

@@ -97,8 +97,16 @@ pub fn decode_to_string(
DecoderTrap::Strict
} else {
DecoderTrap::Call(|_, d, out| {
if d.len() == 1 && d[0] == 0xFF {
out.write_char('\u{f8f3}'); // PUA character for U+F8F3
if d.len() == 1 {
if d[0] == 0xFF {
out.write_char('\u{f8f3}'); // PUA character for U+F8F3
} else if d[0] == 0xFE {
out.write_char('\u{f8f2}'); // PUA character for U+F8F2
} else if d[0] == 0xFD {
out.write_char('\u{f8f1}'); // PUA character for U+F8F1
} else {
out.write_char('\u{FFFD}'); // Replacement character
}
} else {
out.write_char('\u{FFFD}'); // Replacement character
}
@@ -150,6 +158,10 @@ thread_local! {
fn jis_encoder_trap(_: &mut dyn RawEncoder, data: &str, out: &mut dyn ByteWriter) -> bool {
if data == "\u{f8f3}" {
out.write_byte(0xFF); // PUA character for U+F8F3
} else if data == "\u{f8f2}" {
out.write_byte(0xFE); // PUA character for U+F8F2
} else if data == "\u{f8f1}" {
out.write_byte(0xFD); // PUA character for U+F8F1
} else {
out.write_byte(b'?'); // Replacement character
ENCODE_REPLACED.with(|f| f.qsave(true));
@@ -402,4 +414,49 @@ fn shift_jis_pua_test() {
decode_to_string(Encoding::Cp932, &ff, false).unwrap(),
"\u{f8f3}\x01".to_string()
);
#[cfg(windows)]
assert!(decode_to_string(Encoding::CodePage(932), &ff, true).is_err());
assert!(decode_to_string(Encoding::Cp932, &ff, true).is_err());
let fe = [0xFE, 0x01];
#[cfg(windows)]
assert_eq!(
decode_to_string(Encoding::CodePage(932), &fe, false).unwrap(),
"\u{f8f2}\x01".to_string()
);
assert_eq!(
decode_to_string(Encoding::Cp932, &fe, false).unwrap(),
"\u{f8f2}\x01".to_string()
);
#[cfg(windows)]
assert!(decode_to_string(Encoding::CodePage(932), &fe, true).is_err());
assert!(decode_to_string(Encoding::Cp932, &fe, true).is_err());
let fd = [0xFD, 0x01];
#[cfg(windows)]
assert_eq!(
decode_to_string(Encoding::CodePage(932), &fd, false).unwrap(),
"\u{f8f1}\x01".to_string()
);
assert_eq!(
decode_to_string(Encoding::Cp932, &fd, false).unwrap(),
"\u{f8f1}\x01".to_string()
);
#[cfg(windows)]
assert!(decode_to_string(Encoding::CodePage(932), &fd, true).is_err());
assert!(decode_to_string(Encoding::Cp932, &fd, true).is_err());
let ff = "\u{f8f3}\x01";
#[cfg(windows)]
assert_eq!(
encode_string(Encoding::CodePage(932), ff, false).unwrap(),
vec![0xFF, 0x01]
);
assert_eq!(
encode_string(Encoding::Cp932, ff, false).unwrap(),
vec![0xFF, 0x01]
);
#[cfg(windows)]
assert_eq!(
encode_string(Encoding::CodePage(932), ff, true).unwrap(),
vec![0xFF, 0x01]
);
assert!(encode_string(Encoding::Cp932, ff, true).is_err());
}

View File

@@ -1,6 +1,7 @@
use windows_sys::Win32::Foundation::{ERROR_NO_UNICODE_TRANSLATION, GetLastError};
use windows_sys::Win32::Globalization::{
CP_UTF7, CP_UTF8, MB_ERR_INVALID_CHARS, MultiByteToWideChar, WideCharToMultiByte,
CP_UTF7, CP_UTF8, MB_ERR_INVALID_CHARS, MultiByteToWideChar, WC_ERR_INVALID_CHARS,
WideCharToMultiByte,
};
use windows_sys::Win32::System::Diagnostics::Debug::{
FORMAT_MESSAGE_FROM_SYSTEM, FORMAT_MESSAGE_IGNORE_INSERTS, FormatMessageW,
@@ -99,11 +100,16 @@ pub fn encode_string(cp: u32, data: &str, check: bool) -> Result<Vec<u8>, WinErr
if data.is_empty() {
return Ok(Vec::new());
}
let dwflags = if check && cp == 65001 {
WC_ERR_INVALID_CHARS
} else {
0
};
let wstr = data.encode_utf16().collect::<Vec<u16>>();
let needed_len = unsafe {
WideCharToMultiByte(
cp,
0,
dwflags,
wstr.as_ptr(),
wstr.len() as i32,
std::ptr::null_mut(),
@@ -121,7 +127,7 @@ pub fn encode_string(cp: u32, data: &str, check: bool) -> Result<Vec<u8>, WinErr
let result = unsafe {
WideCharToMultiByte(
cp,
0,
dwflags,
wstr.as_ptr(),
wstr.len() as i32,
mb.as_mut_ptr(),