Files
msg-tool/src/scripts/bgi/archive/dsc.rs

694 lines
21 KiB
Rust

//! Support for DSC-compressed files stored in Buriko General Interpreter/Ethornell archives.
use crate::ext::io::*;
use crate::ext::vec::*;
use crate::scripts::base::*;
use crate::types::*;
use crate::utils::bit_stream::*;
use crate::utils::num_range::*;
use anyhow::Result;
use rand::RngExt;
use std::collections::BinaryHeap;
use std::io::{Seek, Write};
/// A symbol together with its Huffman code length.
///
/// Values order by `depth` first and by `code` second, which is the order the
/// decoder needs when it lays out the tree level by level.
#[derive(Debug, PartialEq, Eq)]
struct HuffmanCode {
    /// Symbol value this entry stands for.
    code: u16,
    /// Code length in bits.
    depth: u8,
}

impl std::cmp::Ord for HuffmanCode {
    /// Shallower codes come first; equal depths are ordered by symbol value.
    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
        self.depth
            .cmp(&other.depth)
            .then_with(|| self.code.cmp(&other.code))
    }
}

impl std::cmp::PartialOrd for HuffmanCode {
    /// Delegates to [`Ord::cmp`] so both orderings always agree.
    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
        Some(self.cmp(other))
    }
}
/// One slot of the flat decoding tree produced by `DscDecoder::create_huffman_tree`.
///
/// Children are referenced by their index in the node vector rather than by pointer.
#[derive(Clone, Debug)]
struct HuffmanNode {
/// `true` once tree construction has given this node two children.
is_parent: bool,
/// Symbol stored in this node; set only for nodes that were assigned a code.
code: Option<u16>,
/// Index of the child followed when the next bit is 0.
left_index: usize,
/// Index of the child followed when the next bit is 1.
right_index: usize,
}
/// Decoder for Buriko General Interpreter/Ethornell compressed files (DSC format).
pub struct DscDecoder<'a> {
/// Bit reader over the raw file bytes.
stream: MsbBitStream<MemReaderRef<'a>>,
/// Rolling key read from offset 0x10; `update_key` advances it once per code-length byte.
key: u32,
/// First 16-bit word of the file moved into the upper half; mixed into the key schedule.
magic: u32,
/// Size of the decompressed data as declared in the header.
output_size: u32,
/// Number of Huffman symbols to decode, as declared in the header.
dec_count: u32,
}
impl<'a> DscDecoder<'a> {
/// Creates a new DscDecoder from the given data slice.
pub fn new(data: &'a [u8]) -> Result<Self> {
let mut reader = MemReaderRef::new(data);
let magic = (reader.read_u16()? as u32) << 16;
reader.pos = 0x10;
let key = reader.read_u32()?;
let output_size = reader.read_u32()?;
let dec_count = reader.read_u32()?;
let stream = MsbBitStream::new(reader);
Ok(DscDecoder {
stream,
key,
magic,
output_size,
dec_count,
})
}
/// Unpacks the DSC file and returns the decompressed data.
///
/// Reads the 512 obfuscated code lengths stored at offset 0x20, rebuilds the
/// Huffman tree from them and then decodes the bit stream that follows.
///
/// # Errors
///
/// Propagates read errors from the code-length table and any error reported
/// while decoding the bit stream.
pub fn unpack(mut self) -> Result<Vec<u8>> {
    self.stream.m_input.pos = 0x20;
    let mut codes = Vec::new();
    for symbol in 0..512u16 {
        let stored = self.stream.m_input.read_u8()?;
        // Each stored byte is the real code length plus one byte of key stream.
        let key_byte = self.update_key();
        let depth = stored.wrapping_sub(key_byte);
        // A length of zero marks a symbol that never occurs.
        if depth != 0 {
            codes.push(HuffmanCode {
                code: symbol,
                depth,
            });
        }
    }
    // Tree construction consumes the codes ordered by depth, then by symbol.
    codes.sort();
    let tree = Self::create_huffman_tree(codes);
    self.huffman_decompress(tree)
}
/// Builds the flat decoding tree from codes ordered by depth and then by symbol.
///
/// The returned vector always holds 1024 nodes with the root at index 0. The tree is
/// laid out one level at a time: the codes of the current depth become leaves in the
/// first open slots of that level, and every remaining slot becomes a parent that is
/// given two freshly numbered children for the next level.
///
/// NOTE(review): the code-length table is not validated here. If one depth lists more
/// codes than there are open slots, `depth_nodes - depth_existed_nodes` underflows —
/// confirm callers only pass well-formed tables.
fn create_huffman_tree(codes: Vec<HuffmanCode>) -> Vec<HuffmanNode> {
let mut trees = Vec::with_capacity(1024);
trees.resize(
1024,
HuffmanNode {
is_parent: false,
code: None,
left_index: 0,
right_index: 0,
},
);
// Two scratch lists of node indices: one holds the slots of the current level while
// the other receives the slots of the next level; they swap roles at every depth.
let mut left_index = vec![0usize; 512];
let mut right_index = vec![0usize; 512];
// Next unused slot in `trees`; slot 0 is the root.
let mut next_node_index = 1usize;
// Number of open slots on the current level.
let mut depth_nodes = 1usize;
let mut depth = 0u8;
let mut left_child = true;
let mut n = 0;
while n < codes.len() {
// `huffman_node_index` selects the list that holds the current level; after the
// toggle, `left_child` selects the list that will receive the children.
let huffman_node_index = left_child;
left_child = !left_child;
let mut depth_existed_nodes = 0;
// Every code of this depth takes the next open slot as a leaf.
while n < codes.len() && codes[n].depth == depth {
let index = if huffman_node_index {
left_index[depth_existed_nodes]
} else {
right_index[depth_existed_nodes]
};
trees[index].code = Some(codes[n].code);
n += 1;
depth_existed_nodes += 1;
}
// The slots that were not used as leaves become parents.
let depth_nodes_to_create = depth_nodes - depth_existed_nodes;
for i in 0..depth_nodes_to_create {
let index = if huffman_node_index {
left_index[depth_existed_nodes + i]
} else {
right_index[depth_existed_nodes + i]
};
let node = &mut trees[index];
node.is_parent = true;
// Allocate two consecutive nodes as children and record them as slots of the
// next level.
if left_child {
left_index[i * 2] = next_node_index;
node.left_index = next_node_index;
next_node_index += 1;
left_index[i * 2 + 1] = next_node_index;
node.right_index = next_node_index;
next_node_index += 1;
} else {
right_index[i * 2] = next_node_index;
node.left_index = next_node_index;
next_node_index += 1;
right_index[i * 2 + 1] = next_node_index;
node.right_index = next_node_index;
next_node_index += 1;
}
}
depth += 1;
// Each parent contributes two open slots to the next level.
depth_nodes = depth_nodes_to_create * 2;
}
trees
}
fn huffman_decompress(&mut self, nodes: Vec<HuffmanNode>) -> Result<Vec<u8>> {
let output_size = self.output_size as usize;
let mut output = Vec::with_capacity(output_size);
let mut dst = 0;
output.resize(output_size, 0);
for _ in 0..self.dec_count {
let mut current_node = &nodes[0];
loop {
let bit = self.stream.get_next_bit()?;
if !bit {
current_node = &nodes[current_node.left_index]
} else {
current_node = &nodes[current_node.right_index]
}
if !current_node.is_parent {
break;
}
}
let code = *current_node.code.as_ref().unwrap();
if code >= 256 {
let mut offset = self.stream.get_bits(12)?;
let count = ((code & 0xFF) + 2) as usize;
offset += 2;
output.copy_overlapped(dst - offset as usize, dst, count);
dst += count;
} else {
output[dst] = code as u8;
dst += 1;
}
}
if dst != output_size {
eprintln!(
"Warning: Output size mismatch, expected {}, got {}",
self.output_size, dst
);
crate::COUNTER.inc_warning();
}
Ok(output)
}
fn update_key(&mut self) -> u8 {
let v0 = 20021 * (self.key & 0xffff);
let mut v1 = self.magic | (self.key >> 16);
v1 = v1
.overflowing_mul(20021)
.0
.overflowing_add(self.key.overflowing_mul(346).0)
.0;
v1 = overf::wrapping!(v1 + (v0 >> 16)) & 0xffff;
self.key = (v1 << 16) + (v0 & 0xffff) + 1;
v1 as u8
}
}
/// One token produced by the LZSS pass of the encoder.
#[derive(Debug, Clone, Copy)]
enum LzssOp {
/// A single byte copied verbatim from the input.
Literal(u8),
/// A repetition of earlier data: `len` bytes found `offset` bytes before the current position.
Match { len: u16, offset: u16 },
}
/// Node of the frequency tree used while deriving Huffman code lengths.
///
/// Comparison looks at `freq` only and is reversed, so that a `BinaryHeap` (a max-heap)
/// hands out the node with the lowest frequency first.
#[derive(Debug)]
struct FreqNode {
    /// Combined frequency of all symbols below this node.
    freq: u32,
    /// Symbol of a leaf; `None` for internal nodes.
    symbol: Option<u16>,
    /// First child of an internal node.
    left: Option<Box<FreqNode>>,
    /// Second child of an internal node.
    right: Option<Box<FreqNode>>,
}

impl Ord for FreqNode {
    /// Reversed frequency order: the rarer node is the "greater" one.
    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
        self.freq.cmp(&other.freq).reverse()
    }
}

impl PartialOrd for FreqNode {
    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
        Some(self.cmp(other))
    }
}

impl PartialEq for FreqNode {
    /// Two nodes are equal when their frequencies are equal, whatever they contain.
    fn eq(&self, other: &Self) -> bool {
        self.cmp(other) == std::cmp::Ordering::Equal
    }
}

impl Eq for FreqNode {}
/// Computes a Huffman code length for every symbol from its frequency.
///
/// Returns a 512-entry table indexed by symbol; symbols that never occur get length 0.
/// A single used symbol is given length 1. Otherwise a Huffman tree is built and, while
/// its deepest leaf exceeds `MAX_DEPTH` bits, the frequencies are flattened and the tree
/// is rebuilt.
fn calculate_huffman_depths(freqs: &[u32]) -> Vec<u8> {
    const MAX_DEPTH: u8 = 9;
    // Collect every symbol with a non-zero frequency.
    let mut symbols_with_freq: Vec<(u16, u32)> = freqs
        .iter()
        .enumerate()
        .filter(|&(_, &freq)| freq > 0)
        .map(|(symbol, &freq)| (symbol as u16, freq))
        .collect();
    let mut depths = vec![0u8; 512];
    if symbols_with_freq.is_empty() {
        return depths;
    }
    if symbols_with_freq.len() == 1 {
        depths[symbols_with_freq[0].0 as usize] = 1;
        return depths;
    }
    // Length-limited Huffman coding by repeated rebuilding.
    loop {
        let current_depths = build_huffman_tree(&symbols_with_freq);
        let max_depth = current_depths.iter().copied().max().unwrap_or(0);
        if max_depth <= MAX_DEPTH {
            // `current_depths` is parallel to `symbols_with_freq`, so the two can be walked
            // together instead of searching for each symbol's position again.
            for (&(symbol, _), &depth) in symbols_with_freq.iter().zip(&current_depths) {
                depths[symbol as usize] = depth;
            }
            return depths;
        }
        // The tree is too deep: adjust the frequencies and try again.
        adjust_frequencies_for_depth_limit(&mut symbols_with_freq);
    }
}
/// Builds a Huffman tree over `(symbol, frequency)` pairs and returns the depth of each leaf.
///
/// The returned vector is parallel to `symbols_with_freq`: entry `i` is the code length
/// of the symbol at position `i`. An empty input yields an empty vector.
fn build_huffman_tree(symbols_with_freq: &[(u16, u32)]) -> Vec<u8> {
    let mut queue = BinaryHeap::new();
    // Seed the queue with one leaf per symbol, pushed one at a time in input order.
    symbols_with_freq.iter().for_each(|&(symbol, freq)| {
        queue.push(FreqNode {
            freq,
            symbol: Some(symbol),
            left: None,
            right: None,
        });
    });
    let mut depths = vec![0u8; symbols_with_freq.len()];
    // Keep merging the two rarest nodes; the last node standing is the root.
    let root = loop {
        let Some(first) = queue.pop() else {
            return depths;
        };
        let Some(second) = queue.pop() else {
            break first;
        };
        queue.push(FreqNode {
            freq: first.freq + second.freq,
            symbol: None,
            left: Some(Box::new(first)),
            right: Some(Box::new(second)),
        });
    };
    // Record how deep every leaf sits.
    calculate_depths(&root, 0, symbols_with_freq, &mut depths);
    depths
}
/// Walks the subtree rooted at `node` and records the depth of every leaf it contains.
///
/// * `depth` - Depth of `node` itself; the root is at depth 0.
/// * `symbols_with_freq` - Symbol list that defines the position of each symbol in `depths`.
/// * `depths` - Output table, parallel to `symbols_with_freq`.
///
/// # Panics
///
/// Panics if a leaf carries a symbol that does not appear in `symbols_with_freq`.
fn calculate_depths(
    node: &FreqNode,
    depth: u8,
    symbols_with_freq: &[(u16, u32)],
    depths: &mut [u8],
) {
    match node.symbol {
        Some(symbol) => {
            let slot = symbols_with_freq
                .iter()
                .position(|&(candidate, _)| candidate == symbol)
                .unwrap();
            // A leaf that is itself the root still gets a one-bit code.
            depths[slot] = depth.max(1);
        }
        None => {
            for child in node.left.iter().chain(node.right.iter()) {
                calculate_depths(child, depth + 1, symbols_with_freq, depths);
            }
        }
    }
}
/// Raises the lowest frequencies a little; used when the Huffman tree came out too deep.
///
/// This is a heuristic stand-in for a proper length-limited algorithm such as
/// Package-Merge. The slice is re-sorted by ascending frequency, then the rarest quarter
/// of the symbols (at least one) is increased by 10% of the smallest frequency, but by
/// no less than 1.
///
/// # Panics
///
/// Panics if `symbols_with_freq` is empty.
fn adjust_frequencies_for_depth_limit(symbols_with_freq: &mut [(u16, u32)]) {
    // Stable sort: symbols with equal frequency keep their relative order.
    symbols_with_freq.sort_by_key(|&(_, freq)| freq);
    let rarest = symbols_with_freq[0].1;
    let bump = (rarest as f64 * 0.1).max(1.0) as u32;
    let adjusted = (symbols_with_freq.len() / 4).max(1);
    for (_, freq) in symbols_with_freq.iter_mut().take(adjusted) {
        *freq += bump;
    }
}
/// Assigns canonical Huffman codes from per-symbol code lengths.
///
/// Symbols are processed by increasing length and, within one length, by increasing
/// symbol value. The first symbol gets code 0; every following symbol gets the previous
/// code plus one, shifted left by the growth in length.
///
/// Returns a 512-entry table indexed by symbol holding `(code, length)`, or `None` for
/// symbols whose length is 0.
fn generate_canonical_codes(depths: &[u8]) -> Vec<Option<(u16, u8)>> {
    let mut ordered: Vec<(u16, u8)> = depths
        .iter()
        .enumerate()
        .filter(|&(_, &depth)| depth > 0)
        .map(|(symbol, &depth)| (symbol as u16, depth))
        .collect();
    ordered.sort_by_key(|&(symbol, depth)| (depth, symbol));

    let mut table = vec![None; 512];
    let mut next_code = 0u16;
    let mut previous_depth: Option<u8> = None;
    for (symbol, depth) in ordered {
        // Moving to a longer code appends zero bits to the running value.
        if let Some(previous) = previous_depth {
            next_code <<= depth - previous;
        }
        table[symbol as usize] = Some((next_code, depth));
        next_code += 1;
        previous_depth = Some(depth);
    }
    table
}
/// Encoder for Buriko General Interpreter/Ethornell compressed files (DSC format).
pub struct DscEncoder<'a, T: Write + Seek> {
/// Bit writer wrapping the output.
stream: MsbBitWriter<'a, T>,
/// Constant mixed into the key schedule (`0x5344` in the upper 16 bits).
magic: u32,
/// Rolling key; `pack` writes its initial value to the header and `update_key` advances it.
key: u32,
/// Number of encoded symbols; set and written to the header by `pack`.
dec_count: u32,
/// Shortest match that `pack` emits as a back-reference.
min_len: usize,
}
impl<'a, T: Write + Seek> DscEncoder<'a, T> {
/// Creates a new DscEncoder with the given writer and minimum length for LZSS compression.
pub fn new(writer: &'a mut T, min_len: usize) -> Self {
let stream = MsbBitWriter::new(writer);
DscEncoder {
stream,
magic: 0x5344 << 16, // "DS"
key: rand::rng().random(),
dec_count: 0,
min_len,
}
}
/// Packs the given data into the DSC format using LZSS compression.
pub fn pack(mut self, data: &[u8]) -> Result<()> {
// LZSS compression
let mut ops = vec![];
let mut pos = 0;
const MAX_LEN: usize = 257;
const WINDOW_SIZE: usize = 4097;
let mut head: Vec<i32> = vec![-1; 1 << 16];
let mut prev: Vec<i32> = vec![-1; data.len()];
while pos < data.len() {
let max_len = (data.len() - pos).min(MAX_LEN);
let mut best_len = 0;
let mut best_offset = 0;
if max_len >= self.min_len {
let limit = pos.saturating_sub(WINDOW_SIZE);
let key = (data[pos] as u16) << 8 | data[pos + 1] as u16;
let mut match_pos_i32 = head[key as usize];
while match_pos_i32 != -1 {
let match_pos = match_pos_i32 as usize;
if match_pos < limit {
break;
}
if data.get(match_pos + best_len) == data.get(pos + best_len) {
let mut current_len = 0;
for i in 0..max_len {
if data.get(pos + i) != data.get(match_pos + i) {
break;
}
current_len += 1;
}
if current_len > best_len {
best_len = current_len;
best_offset = pos - match_pos;
if best_len >= max_len {
break;
}
}
}
match_pos_i32 = prev[match_pos];
}
}
if best_len >= self.min_len && best_offset >= 2 {
ops.push(LzssOp::Match {
len: best_len as u16,
offset: best_offset as u16,
});
for i in 0..best_len {
if pos + i + 1 < data.len() {
let key = (data[pos + i] as u16) << 8 | data[pos + i + 1] as u16;
let current_pos = pos + i;
prev[current_pos] = head[key as usize];
head[key as usize] = current_pos as i32;
}
}
pos += best_len;
} else {
ops.push(LzssOp::Literal(data[pos]));
if pos + 1 < data.len() {
let key = (data[pos] as u16) << 8 | data[pos + 1] as u16;
prev[pos] = head[key as usize];
head[key as usize] = pos as i32;
}
pos += 1;
}
}
let symbols: Vec<u16> = ops
.iter()
.map(|op| match op {
LzssOp::Literal(byte) => *byte as u16,
LzssOp::Match { len, .. } => 256 + (len - 2),
})
.collect();
self.dec_count = symbols.len() as u32;
let mut freqs = vec![0u32; 512];
for &s in &symbols {
freqs[s as usize] += 1;
}
let depths = calculate_huffman_depths(&freqs);
let huffman_codes = generate_canonical_codes(&depths);
self.stream.writer.write_all(b"DSC FORMAT 1.00\0")?;
self.stream.writer.seek(std::io::SeekFrom::Start(0x10))?;
self.stream.writer.write_u32(self.key)?;
self.stream.writer.write_u32(data.len() as u32)?;
self.stream.writer.write_u32(self.dec_count)?;
self.stream.writer.seek(std::io::SeekFrom::Start(0x20))?;
for depth in depths.iter() {
let key = self.update_key();
self.stream.writer.write_u8(depth.overflowing_add(key).0)?;
}
for op in &ops {
match op {
LzssOp::Literal(byte) => {
let symbol = *byte as u16;
let (code, len) = huffman_codes[symbol as usize].unwrap();
self.stream.put_bits(code as u32, len)?;
}
LzssOp::Match { len, offset } => {
let symbol = 256 + (len - 2);
let (code, huff_len) = huffman_codes[symbol as usize].unwrap();
self.stream.put_bits(code as u32, huff_len)?;
self.stream.put_bits((*offset - 2) as u32, 12)?;
}
}
}
self.stream.flush()?;
Ok(())
}
/// Advances the rolling key and returns the next key-stream byte.
///
/// `pack` adds the returned byte to each code length before writing it.
///
/// All arithmetic wraps modulo 2^32. The mixed value is an arbitrary 32-bit number, so
/// adding `v0 >> 16` to it with a plain `+` panics on overflow whenever overflow checks
/// are enabled (debug builds by default); the same holds for the final `+ 1`.
fn update_key(&mut self) -> u8 {
    // 20021 * 0xffff still fits in 32 bits, so this product cannot overflow.
    let v0 = 20021 * (self.key & 0xffff);
    let v1 = (self.magic | (self.key >> 16))
        .wrapping_mul(20021)
        .wrapping_add(self.key.wrapping_mul(346))
        .wrapping_add(v0 >> 16)
        & 0xffff;
    // High half: mixed value; low half: low half of `v0`; then increment with wrap-around.
    self.key = ((v1 << 16) | (v0 & 0xffff)).wrapping_add(1);
    v1 as u8
}
}
/// Builder for DSC scripts.
///
/// The builder carries no state of its own.
#[derive(Debug)]
pub struct DscBuilder {}

impl DscBuilder {
    /// Creates a new instance of `DscBuilder`.
    pub fn new() -> Self {
        Self {}
    }
}
impl ScriptBuilder for DscBuilder {
    /// Script text defaults to CP932.
    fn default_encoding(&self) -> Encoding {
        Encoding::Cp932
    }

    /// Archive names default to CP932 as well.
    fn default_archive_encoding(&self) -> Option<Encoding> {
        Some(Encoding::Cp932)
    }

    /// Decompresses `buf` and wraps the result in a [`Dsc`] script.
    ///
    /// Only `buf` and `config` are used; the other arguments are ignored.
    fn build_script(
        &self,
        buf: Vec<u8>,
        _filename: &str,
        _encoding: Encoding,
        _archive_encoding: Encoding,
        config: &ExtraConfig,
        _archive: Option<&Box<dyn Script>>,
    ) -> Result<Box<dyn Script>> {
        let script = Dsc::new(buf, config)?;
        Ok(Box::new(script))
    }

    /// No file extension is registered for this format.
    fn extensions(&self) -> &'static [&'static str] {
        &[]
    }

    fn script_type(&self) -> &'static ScriptType {
        &ScriptType::BGIDsc
    }

    /// Reports a score of 255 when the buffer starts with the 16-byte DSC signature.
    fn is_this_format(&self, _filename: &str, buf: &[u8], buf_len: usize) -> Option<u8> {
        let has_signature = buf_len >= 16 && buf.starts_with(b"DSC FORMAT 1.00\0");
        if has_signature {
            Some(255)
        } else {
            None
        }
    }

    /// DSC files can be created from scratch.
    fn can_create_file(&self) -> bool {
        true
    }

    /// Compresses the file at `filename` and writes the DSC stream to `writer`.
    ///
    /// The minimum match length is taken from `config.bgi_compress_min_len`.
    fn create_file<'a>(
        &'a self,
        filename: &'a str,
        mut writer: Box<dyn WriteSeek + 'a>,
        _encoding: Encoding,
        _file_encoding: Encoding,
        config: &ExtraConfig,
    ) -> Result<()> {
        let encoder = DscEncoder::new(&mut writer, config.bgi_compress_min_len);
        let data = crate::utils::files::read_file(filename)?;
        encoder.pack(&data)
    }
}
#[derive(Debug)]
/// A DSC script held in decompressed form.
pub struct Dsc {
/// Decompressed contents of the file.
data: Vec<u8>,
/// Minimum LZSS match length used when the script is compressed again on import.
min_len: usize,
}
impl Dsc {
    /// Creates a new Dsc script
    ///
    /// * `buf` - The buffer containing the DSC data.
    /// * `config` - Extra configuration options.
    ///
    /// # Errors
    ///
    /// Fails with "Invalid DSC format" when `buf` does not start with the 16-byte DSC
    /// signature, and propagates any error raised while decompressing.
    pub fn new(buf: Vec<u8>, config: &ExtraConfig) -> Result<Self> {
        let has_signature = buf.len() >= 16 && buf.starts_with(b"DSC FORMAT 1.00\0");
        if !has_signature {
            return Err(anyhow::anyhow!("Invalid DSC format"));
        }
        let data = DscDecoder::new(&buf)?.unpack()?;
        Ok(Self {
            data,
            min_len: config.bgi_compress_min_len,
        })
    }
}
impl Script for Dsc {
    /// DSC scripts are exported through the custom path only.
    fn default_output_script_type(&self) -> OutputScriptType {
        OutputScriptType::Custom
    }

    /// Only the custom output type is supported.
    fn is_output_supported(&self, output: OutputScriptType) -> bool {
        match output {
            OutputScriptType::Custom => true,
            _ => false,
        }
    }

    fn default_format_type(&self) -> FormatOptions {
        FormatOptions::None
    }

    /// Exported files get no extension of their own.
    fn custom_output_extension(&self) -> &'static str {
        ""
    }

    /// Writes the decompressed data to `filename`, replacing any existing file.
    fn custom_export(&self, filename: &std::path::Path, _encoding: Encoding) -> Result<()> {
        std::fs::write(filename, &self.data)?;
        Ok(())
    }

    /// Compresses the file at `custom_filename` and writes the DSC stream to `file`.
    fn custom_import<'a>(
        &'a self,
        custom_filename: &'a str,
        mut file: Box<dyn WriteSeek + 'a>,
        _encoding: Encoding,
        _output_encoding: Encoding,
    ) -> Result<()> {
        let encoder = DscEncoder::new(&mut file, self.min_len);
        let data = crate::utils::files::read_file(custom_filename)?;
        encoder.pack(&data)
    }
}
/// Parses the minimum length for LZSS compression from a string.
///
/// Delegates to `number_range` with the bounds 2 and 256 and returns its result unchanged.
// NOTE(review): presumably both bounds are inclusive — confirm against `number_range`.
pub fn parse_min_length(len: &str) -> Result<usize, String> {
number_range(len, 2, 256)
}