From f211986319af43a0e17daf847b9a0375884467f5 Mon Sep 17 00:00:00 2001
From: lifegpc
Date: Fri, 5 Jun 2026 10:00:47 +0800
Subject: [PATCH] Add zopfli like compression method for BGI DSC file

---
 src/args.rs                    |   3 +-
 src/scripts/bgi/archive/dsc.rs | 266 ++++++++++++++++++++++++---------
 src/types.rs                   |   3 +-
 3 files changed, 197 insertions(+), 75 deletions(-)

diff --git a/src/args.rs b/src/args.rs
index f361531..3b1a34e 100644
--- a/src/args.rs
+++ b/src/args.rs
@@ -319,7 +319,8 @@ pub struct Arg {
     pub bgi_compress_file: bool,
     #[cfg(feature = "bgi-arc")]
     #[arg(long, global = true, default_value_t = 9, value_parser = crate::scripts::bgi::archive::dsc::parse_compress_level)]
-    /// Compress level for BGI Dsc file. 0 means store, 9 mean best compression.
+    /// Compress level for BGI Dsc file. 0 means store, 10 mean best compression.
+    /// 10 will use zopfli like compression method, this may cost a lot of time.
     pub bgi_compress_level: u8,
     #[cfg(feature = "emote-img")]
     #[arg(long, global = true)]
diff --git a/src/scripts/bgi/archive/dsc.rs b/src/scripts/bgi/archive/dsc.rs
index 2da651a..453a5bb 100644
--- a/src/scripts/bgi/archive/dsc.rs
+++ b/src/scripts/bgi/archive/dsc.rs
@@ -226,6 +226,7 @@ pub enum MatchMode {
     Rle,
     NonLazy,
     Lazy,
+    Optimal, // Optimal parsing mode
 }
 
 #[derive(Debug, Clone, Copy)]
@@ -237,7 +238,7 @@ pub struct CompressConfig {
     pub mode: MatchMode,
 }
 
-pub const COMPRESS_CONFIGS: [CompressConfig; 10] = [
+pub const COMPRESS_CONFIGS: [CompressConfig; 11] = [
     // 0: Store (No compression)
     CompressConfig {
         good_length: 0,
@@ -318,6 +319,14 @@ pub const COMPRESS_CONFIGS: [CompressConfig; 10] = [
         max_chain: 4096,
         mode: MatchMode::Lazy,
     },
+    // 10: Optimal (Zopfli-like) - exhaustively evaluates all candidates to find the optimal parse
+    CompressConfig {
+        good_length: 258,
+        max_lazy: 258,
+        nice_length: 258,
+        max_chain: 4096,
+        mode: MatchMode::Optimal,
+    },
 ];
 
 /// Computes optimal length-limited Huffman code depths using the Package-Merge algorithm.
@@ -466,7 +475,7 @@ fn find_match(
     let src_slice = &data[pos..pos + max_len];
     let limit = pos.saturating_sub(4097);
 
-    // Level 3~9: 基于哈希字典进行跳跃搜索
+    // Level 3~10: search by jumping through the hash dictionary
     let key = ((data[pos] as usize) << 8) | (data[pos + 1] as usize);
     let mut match_pos_i32 = head[key];
     let mut chain_length = config.max_chain;
@@ -523,6 +532,112 @@ fn find_match(
     }
 }
 
+/// Zopfli-like optimal parsing.
+/// Runs several dynamic-programming passes, re-estimating Huffman costs in between, to pick the cheapest LZSS op sequence.
+fn optimal_parse(data: &[u8], config: &CompressConfig) -> Vec<LzssOp> {
+    let n = data.len();
+    if n == 0 {
+        return vec![];
+    }
+
+    // Precompute the longest match at every position so the DP passes never repeat the search
+    let mut longest_matches = vec![(0usize, 0usize); n];
+    let mut head = vec![-1i32; 1 << 16];
+    let mut prev = vec![-1i32; n];
+    let insert_limit = n.saturating_sub(1);
+
+    for pos in 0..n {
+        let (best_len, best_offset) = find_match(data, pos, &head, &prev, config);
+        longest_matches[pos] = (best_len, best_offset);
+
+        if pos < insert_limit {
+            let key = ((data[pos] as usize) << 8) | (data[pos + 1] as usize);
+            prev[pos] = head[key];
+            head[key] = pos as i32;
+        }
+    }
+
+    // Initial costs: assume every symbol has a 9-bit Huffman code
+    let mut sym_costs = vec![9u32; 512];
+    let mut best_ops = vec![];
+
+    const NUM_ITERATIONS: usize = 4;
+    for iter in 0..NUM_ITERATIONS {
+        let mut costs = vec![u32::MAX; n + 1];
+        let mut links = vec![None; n + 1];
+        costs[0] = 0;
+
+        // Dynamic programming: find the cheapest path from position 0 to n
+        for i in 0..n {
+            let current_cost = costs[i];
+            if current_cost == u32::MAX {
+                continue;
+            }
+
+            // 1. Try a literal
+            let lit_sym = data[i] as usize;
+            let lit_cost = current_cost + sym_costs[lit_sym];
+            if lit_cost < costs[i + 1] {
+                costs[i + 1] = lit_cost;
+                links[i + 1] = Some(LzssOp::Literal(data[i]));
+            }
+
+            // 2. Try a match (every length from 2 up to the longest match reuses its offset)
+            let (max_len, offset) = longest_matches[i];
+            for len in 2..=max_len {
+                let match_sym = 256 + (len - 2);
+                // Match cost = current cost + Huffman cost of the length symbol + fixed 12-bit offset cost
+                let match_cost = current_cost + sym_costs[match_sym] + 12;
+                if match_cost < costs[i + len] {
+                    costs[i + len] = match_cost;
+                    links[i + len] = Some(LzssOp::Match {
+                        len: len as u16,
+                        offset: offset as u16,
+                    });
+                }
+            }
+        }
+
+        // Backtrack from the end to build the operation sequence
+        let mut ops = vec![];
+        let mut curr = n;
+        while curr > 0 {
+            let op = links[curr].expect("every position is reachable through a literal step");
+            ops.push(op);
+            curr -= match op {
+                LzssOp::Literal(_) => 1,
+                LzssOp::Match { len, .. } => len as usize,
+            };
+        }
+        ops.reverse();
+
+        if iter == NUM_ITERATIONS - 1 {
+            best_ops = ops;
+            break;
+        }
+
+        // Count symbol frequencies and refresh the Huffman cost estimates
+        let mut freqs = vec![0u32; 512];
+        for op in &ops {
+            match op {
+                LzssOp::Literal(b) => freqs[*b as usize] += 1,
+                LzssOp::Match { len, .. } => freqs[256 + (*len - 2) as usize] += 1,
+            }
+        }
+
+        let depths = calculate_huffman_depths(&freqs);
+        for i in 0..512 {
+            sym_costs[i] = if depths[i] > 0 {
+                depths[i] as u32
+            } else {
+                9 // Unused symbols get an average penalty cost
+            };
+        }
+    }
+
+    best_ops
+}
+
 /// Encoder for Buriko General Interpreter/Ethornell compressed files (DSC format).
 pub struct DscEncoder<'a, T: Write + Seek> {
     stream: MsbBitWriter<'a, BufWriter<T>>,
@@ -533,7 +648,7 @@ pub struct DscEncoder<'a, T: Write + Seek> {
 }
 
 impl<'a, T: Write + Seek> DscEncoder<'a, T> {
-    /// Creates a new DscEncoder with the given writer and compression level (0-9).
+    /// Creates a new DscEncoder with the given writer and compression level (0-10).
     pub fn new(writer: &'a mut BufWriter<T>, level: u8) -> Self {
         let stream = MsbBitWriter::new(writer);
         DscEncoder {
@@ -541,94 +656,99 @@ impl<'a, T: Write + Seek> DscEncoder<'a, T> {
             magic: 0x5344 << 16, // "DS"
             key: rand::rng().random(),
             dec_count: 0,
-            level: level.min(9),
+            level: level.min(10),
         }
     }
 
     /// Packs the given data into the DSC format using configured LZSS compression.
     pub fn pack(mut self, data: &[u8]) -> Result<()> {
-        let mut ops = vec![];
-        let mut pos = 0;
         let config = &COMPRESS_CONFIGS[self.level as usize];
 
-        // 预分配哈希表,65536 对应 2 bytes 的所有可能
-        let mut head = vec![-1i32; 1 << 16];
-        let mut prev = vec![-1i32; data.len()];
-        let insert_limit = data.len().saturating_sub(1); // 防止 data[p + 1] 越界
+        let ops = if config.mode == MatchMode::Optimal {
+            optimal_parse(data, config)
+        } else {
+            let mut ops = vec![];
+            let mut pos = 0;
+            // Preallocate the hash table; 65536 covers every possible 2-byte key
+            let mut head = vec![-1i32; 1 << 16];
+            let mut prev = vec![-1i32; data.len()];
+            let insert_limit = data.len().saturating_sub(1); // Keeps data[p + 1] in bounds
 
-        while pos < data.len() {
-            if config.mode == MatchMode::Store {
-                ops.push(LzssOp::Literal(data[pos]));
-                pos += 1;
-                continue;
-            }
-
-            let (match_len, match_offset) = find_match(data, pos, &head, &prev, config);
-
-            if match_len >= 2 {
-                let mut lazy_match = false;
-
-                // 延迟匹配逻辑 (Lazy Evaluation)
-                if config.mode == MatchMode::Lazy
-                    && match_len <= config.max_lazy
-                    && pos + 1 < data.len()
-                {
-                    // 为下一次尝试预先将当前 pos 插入字典
-                    if pos < insert_limit {
-                        let key = ((data[pos] as usize) << 8) | (data[pos + 1] as usize);
-                        prev[pos] = head[key];
-                        head[key] = pos as i32;
-                    }
-
-                    let (next_len, _) = find_match(data, pos + 1, &head, &prev, config);
-
-                    if next_len > match_len {
-                        lazy_match = true;
-                    }
-                }
-
-                if lazy_match {
+            while pos < data.len() {
+                if config.mode == MatchMode::Store {
                     ops.push(LzssOp::Literal(data[pos]));
                     pos += 1;
                     continue;
                 }
 
-                ops.push(LzssOp::Match {
-                    len: match_len as u16,
-                    offset: match_offset as u16,
-                });
+                let (match_len, match_offset) = find_match(data, pos, &head, &prev, config);
 
-                let start_insert = if config.mode == MatchMode::Lazy
-                    && match_len <= config.max_lazy
-                    && pos + 1 < data.len()
-                {
-                    1 // 如果进行了延迟检查,pos 已被插入,从 1 开始
-                } else {
-                    0
-                };
+                if match_len >= 2 {
+                    let mut lazy_match = false;
 
-                // 批量插入字典,使用 usize 强制类型,移除闭包产生的隐式开销
-                if config.mode != MatchMode::Rle {
-                    for i in start_insert..match_len {
-                        let p = pos + i;
-                        if p < insert_limit {
-                            let key = ((data[p] as usize) << 8) | (data[p + 1] as usize);
-                            prev[p] = head[key];
-                            head[key] = p as i32;
+                    // Lazy matching (lazy evaluation)
+                    if config.mode == MatchMode::Lazy
+                        && match_len <= config.max_lazy
+                        && pos + 1 < data.len()
+                    {
+                        // Insert the current pos into the dictionary ahead of the next attempt
+                        if pos < insert_limit {
+                            let key = ((data[pos] as usize) << 8) | (data[pos + 1] as usize);
+                            prev[pos] = head[key];
+                            head[key] = pos as i32;
+                        }
+
+                        let (next_len, _) = find_match(data, pos + 1, &head, &prev, config);
+
+                        if next_len > match_len {
+                            lazy_match = true;
                         }
                     }
+
+                    if lazy_match {
+                        ops.push(LzssOp::Literal(data[pos]));
+                        pos += 1;
+                        continue;
+                    }
+
+                    ops.push(LzssOp::Match {
+                        len: match_len as u16,
+                        offset: match_offset as u16,
+                    });
+
+                    let start_insert = if config.mode == MatchMode::Lazy
+                        && match_len <= config.max_lazy
+                        && pos + 1 < data.len()
+                    {
+                        1 // The lazy check already inserted pos, so start from 1
+                    } else {
+                        0
+                    };
+
+                    // Bulk-insert into the dictionary; plain usize indices, no closure overhead
+                    if config.mode != MatchMode::Rle {
+                        for i in start_insert..match_len {
+                            let p = pos + i;
+                            if p < insert_limit {
+                                let key = ((data[p] as usize) << 8) | (data[p + 1] as usize);
+                                prev[p] = head[key];
+                                head[key] = p as i32;
+                            }
+                        }
+                    }
+                    pos += match_len;
+                } else {
+                    ops.push(LzssOp::Literal(data[pos]));
+                    if config.mode != MatchMode::Rle && pos < insert_limit {
+                        let key = ((data[pos] as usize) << 8) | (data[pos + 1] as usize);
+                        prev[pos] = head[key];
+                        head[key] = pos as i32;
+                    }
+                    pos += 1;
                 }
-                pos += match_len;
-            } else {
-                ops.push(LzssOp::Literal(data[pos]));
-                if config.mode != MatchMode::Rle && pos < insert_limit {
-                    let key = ((data[pos] as usize) << 8) | (data[pos + 1] as usize);
-                    prev[pos] = head[key];
-                    head[key] = pos as i32;
-                }
-                pos += 1;
             }
-        }
+            ops
+        };
 
         let symbols: Vec = ops
             .iter()
@@ -824,5 +944,5 @@ impl Script for Dsc {
 
 /// Parses the compression level for LZSS compression from a string.
 pub fn parse_compress_level(level: &str) -> Result {
-    number_range(level, 0, 9).map(|v| v as u8)
+    number_range(level, 0, 10).map(|v| v as u8)
 }
diff --git a/src/types.rs b/src/types.rs
index c079bbe..0b0884e 100644
--- a/src/types.rs
+++ b/src/types.rs
@@ -350,7 +350,8 @@ pub struct ExtraConfig {
     pub bgi_compress_file: bool,
     #[cfg(feature = "bgi-arc")]
     #[default(9)]
-    /// Compress level for BGI Dsc file. 0 means store, 9 mean best compression.
+    /// Compress level for BGI Dsc file. 0 means store, 10 mean best compression.
+    /// 10 will use zopfli like compression method, this may cost a lot of time.
     pub bgi_compress_level: u8,
     #[cfg(feature = "emote-img")]
     /// Whether to overlay PIMG images. (By default, true if all layers are not group layers.)