Add zopfli like compression method for BGI DSC file

This commit is contained in:
2026-06-05 10:00:47 +08:00
parent 456dd64c97
commit f211986319
3 changed files with 197 additions and 75 deletions

View File

@@ -319,7 +319,8 @@ pub struct Arg {
pub bgi_compress_file: bool,
#[cfg(feature = "bgi-arc")]
#[arg(long, global = true, default_value_t = 9, value_parser = crate::scripts::bgi::archive::dsc::parse_compress_level)]
/// Compress level for BGI Dsc file. 0 means store, 9 mean best compression.
/// Compression level for BGI Dsc files. 0 means store, 10 means best compression.
/// Level 10 uses a Zopfli-like compression method, which may take a long time.
pub bgi_compress_level: u8,
#[cfg(feature = "emote-img")]
#[arg(long, global = true)]

View File

@@ -226,6 +226,7 @@ pub enum MatchMode {
Rle,
NonLazy,
Lazy,
Optimal, // 新增:最优解析模式
}
#[derive(Debug, Clone, Copy)]
@@ -237,7 +238,7 @@ pub struct CompressConfig {
pub mode: MatchMode,
}
pub const COMPRESS_CONFIGS: [CompressConfig; 10] = [
pub const COMPRESS_CONFIGS: [CompressConfig; 11] = [
// 0: Store (No compression)
CompressConfig {
good_length: 0,
@@ -318,6 +319,14 @@ pub const COMPRESS_CONFIGS: [CompressConfig; 10] = [
max_chain: 4096,
mode: MatchMode::Lazy,
},
// 10: Optimal (Zopfli-like) - 穷举所有可能以找到最优解
CompressConfig {
good_length: 258,
max_lazy: 258,
nice_length: 258,
max_chain: 4096,
mode: MatchMode::Optimal,
},
];
/// Computes optimal length-limited Huffman code depths using the Package-Merge algorithm.
@@ -466,7 +475,7 @@ fn find_match(
let src_slice = &data[pos..pos + max_len];
let limit = pos.saturating_sub(4097);
// Level 3~9: 基于哈希字典进行跳跃搜索
// Level 3~10: 基于哈希字典进行跳跃搜索
let key = ((data[pos] as usize) << 8) | (data[pos + 1] as usize);
let mut match_pos_i32 = head[key];
let mut chain_length = config.max_chain;
@@ -523,6 +532,112 @@ fn find_match(
}
}
/// Zopfli-like optimal parsing.
///
/// Looks for the cheapest LZSS literal/match sequence for `data` by running a
/// shortest-path dynamic program several times. After every pass except the
/// last, symbol costs are re-estimated from the Huffman code lengths of the
/// parse that pass produced, so later passes optimise against more realistic
/// bit costs.
///
/// Returns the operations chosen by the final pass, in stream order. An empty
/// input yields an empty vector.
fn optimal_parse(data: &[u8], config: &CompressConfig) -> Vec<LzssOp> {
    /// Number of dynamic-programming passes; only the last pass's parse is returned.
    const NUM_ITERATIONS: usize = 4;
    /// Size of the combined literal / match-length alphabet used for costing.
    const SYMBOL_COUNT: usize = 512;
    /// First symbol that stands for a match length (symbols below are literals).
    const FIRST_MATCH_SYMBOL: usize = 256;
    /// Shortest match the parser considers.
    const MIN_MATCH: usize = 2;
    /// Longest match whose length symbol still fits below `SYMBOL_COUNT`.
    const MAX_MATCH: usize = SYMBOL_COUNT - FIRST_MATCH_SYMBOL + MIN_MATCH - 1;
    /// Cost in bits assumed for a symbol before (or without) a Huffman estimate.
    const DEFAULT_SYMBOL_COST: u64 = 9;
    /// Fixed cost in bits charged for the offset of every match.
    const OFFSET_COST: u64 = 12;

    let n = data.len();
    if n == 0 {
        return Vec::new();
    }

    // Pre-compute the longest match at every position once, so the DP passes
    // below never have to repeat the dictionary search.
    let mut head = vec![-1i32; 1 << 16];
    let mut prev = vec![-1i32; n];
    let insert_limit = n.saturating_sub(1); // keeps data[pos + 1] in bounds
    let mut longest_matches: Vec<(usize, usize)> = Vec::with_capacity(n);
    for pos in 0..n {
        longest_matches.push(find_match(data, pos, &head, &prev, config));

        if pos < insert_limit {
            let key = ((data[pos] as usize) << 8) | (data[pos + 1] as usize);
            prev[pos] = head[key];
            head[key] = pos as i32;
        }
    }

    // Initial model: every symbol costs the same number of bits.
    let mut sym_costs = vec![DEFAULT_SYMBOL_COST; SYMBOL_COUNT];

    // DP tables are allocated once and reset per pass. Costs are kept in u64 so
    // the accumulated bit count cannot overflow on very large inputs.
    let mut costs = vec![u64::MAX; n + 1];
    let mut links: Vec<Option<LzssOp>> = vec![None; n + 1];
    let mut ops = Vec::new();

    for iter in 0..NUM_ITERATIONS {
        costs.fill(u64::MAX);
        links.fill(None);
        costs[0] = 0;

        // Forward pass: relax every edge leaving position `i`.
        for i in 0..n {
            let current_cost = costs[i];
            if current_cost == u64::MAX {
                continue;
            }

            // 1. Emit the byte at `i` as a literal.
            let lit_cost = current_cost + sym_costs[data[i] as usize];
            if lit_cost < costs[i + 1] {
                costs[i + 1] = lit_cost;
                links[i + 1] = Some(LzssOp::Literal(data[i]));
            }

            // 2. Emit a match. Any prefix of the longest match is itself a valid
            //    match at the same offset, so every length up to it is tried.
            //    The length is clamped to what the symbol table can express and
            //    to the bytes that remain, so indexing below cannot go out of
            //    bounds whatever `find_match` reports.
            let (found_len, offset) = longest_matches[i];
            let max_len = found_len.min(MAX_MATCH).min(n - i);
            for len in MIN_MATCH..=max_len {
                let match_sym = FIRST_MATCH_SYMBOL + (len - MIN_MATCH);
                // Cost = path so far + length symbol + fixed offset bits.
                let match_cost = current_cost + sym_costs[match_sym] + OFFSET_COST;
                if match_cost < costs[i + len] {
                    costs[i + len] = match_cost;
                    links[i + len] = Some(LzssOp::Match {
                        len: len as u16,
                        offset: offset as u16,
                    });
                }
            }
        }

        // Walk the back-links from the end to recover the chosen operations.
        ops.clear();
        let mut curr = n;
        while curr > 0 {
            let op = links[curr]
                .expect("every position is reachable from the start through literals");
            curr -= match op {
                LzssOp::Literal(_) => 1,
                LzssOp::Match { len, .. } => len as usize,
            };
            ops.push(op);
        }
        ops.reverse();

        // The final pass's parse is the result; no further cost update is needed.
        if iter + 1 == NUM_ITERATIONS {
            break;
        }

        // Count symbol usage in this parse and turn the resulting Huffman code
        // lengths into the cost model for the next pass.
        let mut freqs = vec![0u32; SYMBOL_COUNT];
        for op in &ops {
            let sym = match *op {
                LzssOp::Literal(b) => b as usize,
                LzssOp::Match { len, .. } => FIRST_MATCH_SYMBOL + (len as usize - MIN_MATCH),
            };
            freqs[sym] += 1;
        }

        let depths = calculate_huffman_depths(&freqs);
        for i in 0..SYMBOL_COUNT {
            sym_costs[i] = if depths[i] > 0 {
                depths[i] as u64
            } else {
                // Symbols unused in this parse get an average penalty cost.
                DEFAULT_SYMBOL_COST
            };
        }
    }

    ops
}
/// Encoder for Buriko General Interpreter/Ethornell compressed files (DSC format).
pub struct DscEncoder<'a, T: Write + Seek> {
stream: MsbBitWriter<'a, BufWriter<T>>,
@@ -533,7 +648,7 @@ pub struct DscEncoder<'a, T: Write + Seek> {
}
impl<'a, T: Write + Seek> DscEncoder<'a, T> {
/// Creates a new DscEncoder with the given writer and compression level (0-9).
/// Creates a new DscEncoder with the given writer and compression level (0-10).
pub fn new(writer: &'a mut BufWriter<T>, level: u8) -> Self {
let stream = MsbBitWriter::new(writer);
DscEncoder {
@@ -541,94 +656,99 @@ impl<'a, T: Write + Seek> DscEncoder<'a, T> {
magic: 0x5344 << 16, // "DS"
key: rand::rng().random(),
dec_count: 0,
level: level.min(9),
level: level.min(10),
}
}
/// Packs the given data into the DSC format using configured LZSS compression.
pub fn pack(mut self, data: &[u8]) -> Result<()> {
let mut ops = vec![];
let mut pos = 0;
let config = &COMPRESS_CONFIGS[self.level as usize];
// 预分配哈希表,65536 对应 2 bytes 的所有可能
let mut head = vec![-1i32; 1 << 16];
let mut prev = vec![-1i32; data.len()];
let insert_limit = data.len().saturating_sub(1); // 防止 data[p + 1] 越界
let ops = if config.mode == MatchMode::Optimal {
optimal_parse(data, config)
} else {
let mut ops = vec![];
let mut pos = 0;
// 预分配哈希表,65536 对应 2 bytes 的所有可能
let mut head = vec![-1i32; 1 << 16];
let mut prev = vec![-1i32; data.len()];
let insert_limit = data.len().saturating_sub(1); // 防止 data[p + 1] 越界
while pos < data.len() {
if config.mode == MatchMode::Store {
ops.push(LzssOp::Literal(data[pos]));
pos += 1;
continue;
}
let (match_len, match_offset) = find_match(data, pos, &head, &prev, config);
if match_len >= 2 {
let mut lazy_match = false;
// 延迟匹配逻辑 (Lazy Evaluation)
if config.mode == MatchMode::Lazy
&& match_len <= config.max_lazy
&& pos + 1 < data.len()
{
// 为下一次尝试预先将当前 pos 插入字典
if pos < insert_limit {
let key = ((data[pos] as usize) << 8) | (data[pos + 1] as usize);
prev[pos] = head[key];
head[key] = pos as i32;
}
let (next_len, _) = find_match(data, pos + 1, &head, &prev, config);
if next_len > match_len {
lazy_match = true;
}
}
if lazy_match {
while pos < data.len() {
if config.mode == MatchMode::Store {
ops.push(LzssOp::Literal(data[pos]));
pos += 1;
continue;
}
ops.push(LzssOp::Match {
len: match_len as u16,
offset: match_offset as u16,
});
let (match_len, match_offset) = find_match(data, pos, &head, &prev, config);
let start_insert = if config.mode == MatchMode::Lazy
&& match_len <= config.max_lazy
&& pos + 1 < data.len()
{
1 // 如果进行了延迟检查,pos 已被插入,从 1 开始
} else {
0
};
if match_len >= 2 {
let mut lazy_match = false;
// 批量插入字典,使用 usize 强制类型,移除闭包产生的隐式开销
if config.mode != MatchMode::Rle {
for i in start_insert..match_len {
let p = pos + i;
if p < insert_limit {
let key = ((data[p] as usize) << 8) | (data[p + 1] as usize);
prev[p] = head[key];
head[key] = p as i32;
// 延迟匹配逻辑 (Lazy Evaluation)
if config.mode == MatchMode::Lazy
&& match_len <= config.max_lazy
&& pos + 1 < data.len()
{
// 为下一次尝试预先将当前 pos 插入字典
if pos < insert_limit {
let key = ((data[pos] as usize) << 8) | (data[pos + 1] as usize);
prev[pos] = head[key];
head[key] = pos as i32;
}
let (next_len, _) = find_match(data, pos + 1, &head, &prev, config);
if next_len > match_len {
lazy_match = true;
}
}
if lazy_match {
ops.push(LzssOp::Literal(data[pos]));
pos += 1;
continue;
}
ops.push(LzssOp::Match {
len: match_len as u16,
offset: match_offset as u16,
});
let start_insert = if config.mode == MatchMode::Lazy
&& match_len <= config.max_lazy
&& pos + 1 < data.len()
{
1 // 如果进行了延迟检查,pos 已被插入,从 1 开始
} else {
0
};
// 批量插入字典,使用 usize 强制类型,移除闭包产生的隐式开销
if config.mode != MatchMode::Rle {
for i in start_insert..match_len {
let p = pos + i;
if p < insert_limit {
let key = ((data[p] as usize) << 8) | (data[p + 1] as usize);
prev[p] = head[key];
head[key] = p as i32;
}
}
}
pos += match_len;
} else {
ops.push(LzssOp::Literal(data[pos]));
if config.mode != MatchMode::Rle && pos < insert_limit {
let key = ((data[pos] as usize) << 8) | (data[pos + 1] as usize);
prev[pos] = head[key];
head[key] = pos as i32;
}
pos += 1;
}
pos += match_len;
} else {
ops.push(LzssOp::Literal(data[pos]));
if config.mode != MatchMode::Rle && pos < insert_limit {
let key = ((data[pos] as usize) << 8) | (data[pos + 1] as usize);
prev[pos] = head[key];
head[key] = pos as i32;
}
pos += 1;
}
}
ops
};
let symbols: Vec<u16> = ops
.iter()
@@ -824,5 +944,5 @@ impl Script for Dsc {
/// Parses the compression level for LZSS compression from a string.
pub fn parse_compress_level(level: &str) -> Result<u8, String> {
number_range(level, 0, 9).map(|v| v as u8)
number_range(level, 0, 10).map(|v| v as u8)
}

View File

@@ -350,7 +350,8 @@ pub struct ExtraConfig {
pub bgi_compress_file: bool,
#[cfg(feature = "bgi-arc")]
#[default(9)]
/// Compress level for BGI Dsc file. 0 means store, 9 mean best compression.
/// Compression level for BGI Dsc files. 0 means store, 10 means best compression.
/// Level 10 uses a Zopfli-like compression method, which may take a long time.
pub bgi_compress_level: u8,
#[cfg(feature = "emote-img")]
/// Whether to overlay PIMG images. (By default, true if all layers are not group layers.)