From 56a79106c4c2b4b6c49b38ff885969fa33663aa8 Mon Sep 17 00:00:00 2001 From: lifegpc Date: Mon, 15 Sep 2025 11:40:21 +0800 Subject: [PATCH] Add do not break chinese word support for fixed formatter --- Cargo.lock | 214 +++++++++++++++++++++++++++++++++-- Cargo.toml | 4 +- src/args.rs | 8 ++ src/format/fixed.rs | 180 ++++++++++++++++++++++++++++- src/format/mod.rs | 14 ++- src/main.rs | 12 +- src/scripts/bgi/script.rs | 4 + src/scripts/circus/script.rs | 4 + src/types.rs | 6 + 9 files changed, 426 insertions(+), 20 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index fe67db7..81183e5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -14,6 +14,24 @@ version = "2.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa" +[[package]] +name = "adler32" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aae1277d39aeec15cb388266ecc24b11c80469deae6067e17a1a7aa9e5c1f234" + +[[package]] +name = "ahash" +version = "0.8.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75" +dependencies = [ + "cfg-if", + "once_cell", + "version_check", + "zerocopy", +] + [[package]] name = "aho-corasick" version = "1.1.3" @@ -23,6 +41,12 @@ dependencies = [ "memchr", ] +[[package]] +name = "allocator-api2" +version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923" + [[package]] name = "anstream" version = "0.6.20" @@ -187,6 +211,15 @@ dependencies = [ "shlex", ] +[[package]] +name = "cedarwood" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d910bedd62c24733263d0bed247460853c9d22e8956bd4cd964302095e04e90" +dependencies = [ + "smallvec", +] + [[package]] name = "cfg-if" version = "1.0.3" @@ -326,6 +359,15 @@ dependencies = [ "tiny-keccak", ] +[[package]] +name = "core2" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b49ba7ef1ad6107f8824dbe97de947cbaac53c44e7f9756a1fba0d37c1eec505" +dependencies = [ + "memchr", +] + [[package]] name = "cpufeatures" version = "0.2.17" @@ -426,6 +468,12 @@ dependencies = [ "windows-sys 0.61.0", ] +[[package]] +name = "dary_heap" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "04d2cd9c18b9f454ed67da600630b021a8a80bf33f8c95896ab33aaf1c26b728" + [[package]] name = "dataview" version = "1.0.1" @@ -620,6 +668,12 @@ dependencies = [ "regex-syntax", ] +[[package]] +name = "fastrand" +version = "2.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" + [[package]] name = "fdeflate" version = "0.3.7" @@ -754,6 +808,10 @@ name = "hashbrown" version = "0.14.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" +dependencies = [ + "ahash", + "allocator-api2", +] [[package]] name = "hashbrown" @@ -922,6 +980,43 @@ dependencies = [ "winapi-util", ] +[[package]] +name = "include-flate" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e01b7cb6ca682a621e7cda1c358c9724b53a7b4409be9be1dd443b7f3a26f998" +dependencies = [ + "include-flate-codegen", + "include-flate-compress", + "libflate", + "zstd", +] + +[[package]] +name = "include-flate-codegen" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4f49bf5274aebe468d6e6eba14a977eaf1efa481dc173f361020de70c1c48050" +dependencies = [ + "include-flate-compress", + "libflate", + "proc-macro-error", + "proc-macro2", + "quote", + "syn 2.0.106", + "zstd", +] + +[[package]] +name = "include-flate-compress" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eae6a40e716bcd5931f5dbb79cd921512a4f647e2e9413fded3171fca3824dbc" +dependencies = [ + "libflate", + "zstd", +] + [[package]] name = "indexmap" version = "1.9.3" @@ -975,6 +1070,29 @@ version = "1.0.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c" +[[package]] +name = "jieba-macros" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "348294e44ee7e3c42685da656490f8febc7359632544019621588902216da95c" +dependencies = [ + "phf_codegen 0.13.1", +] + +[[package]] +name = "jieba-rs" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "766bd7012aa5ba49411ebdf4e93bddd59b182d2918e085d58dec5bb9b54b7105" +dependencies = [ + "cedarwood", + "include-flate", + "jieba-macros", + "phf 0.13.1", + "regex", + "rustc-hash", +] + [[package]] name = "jobserver" version = "0.1.34" @@ -1013,6 +1131,30 @@ dependencies = [ "libc", ] +[[package]] +name = "libflate" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "45d9dfdc14ea4ef0900c1cddbc8dcd553fbaacd8a4a282cf4018ae9dd04fb21e" +dependencies = [ + "adler32", + "core2", + "crc32fast", + "dary_heap", + "libflate_lz77", +] + +[[package]] +name = "libflate_lz77" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6e0d73b369f386f1c44abd9c570d5318f55ccde816ff4b562fa452e5182863d" +dependencies = [ + "core2", + "hashbrown 0.14.5", + "rle-decode-fast", +] + [[package]] name = "libtlg-rs" version = "0.2.2" @@ -1169,6 +1311,7 @@ dependencies = [ "fancy-regex", "flate2", "int-enum", + "jieba-rs", "json", "lazy_static", "libflac-sys", @@ -1195,7 +1338,7 @@ dependencies = [ "url", "utf16string", "webp", - "windows-sys 0.59.0", + "windows-sys 0.61.0", "xml5ever", "zstd", ] @@ -1358,7 +1501,17 @@ version = "0.11.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1fd6780a80ae0c52cc120a26a1a42c1ae51b247a253e4e06113d23d2c2edd078" dependencies = [ - "phf_shared", + "phf_shared 0.11.3", +] + +[[package]] +name = "phf" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c1562dc717473dbaa4c1f85a36410e03c047b2e7df7f45ee938fbef64ae7fadf" +dependencies = [ + "phf_shared 0.13.1", + "serde", ] [[package]] @@ -1367,8 +1520,18 @@ version = "0.11.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "aef8048c789fa5e851558d709946d6d79a8ff88c0440c587967f8e94bfb1216a" dependencies = [ - "phf_generator", - "phf_shared", + "phf_generator 0.11.3", + "phf_shared 0.11.3", +] + +[[package]] +name = "phf_codegen" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49aa7f9d80421bca176ca8dbfebe668cc7a2684708594ec9f3c0db0805d5d6e1" +dependencies = [ + "phf_generator 0.13.1", + "phf_shared 0.13.1", ] [[package]] @@ -1377,10 +1540,20 @@ version = "0.11.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3c80231409c20246a13fddb31776fb942c38553c51e871f8cbd687a4cfb5843d" dependencies = [ - "phf_shared", + "phf_shared 0.11.3", "rand 0.8.5", ] +[[package]] +name = "phf_generator" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "135ace3a761e564ec88c03a77317a7c6b80bb7f7135ef2544dbe054243b89737" +dependencies = [ + "fastrand", + "phf_shared 0.13.1", +] + [[package]] name = "phf_shared" version = "0.11.3" @@ -1390,6 +1563,15 @@ dependencies = [ "siphasher", ] +[[package]] +name = "phf_shared" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e57fef6bc5981e38c2ce2d63bfa546861309f875b8a75f092d1d54ae2d64f266" +dependencies = [ + "siphasher", +] + [[package]] name = "pkg-config" version = "0.3.32" @@ -1584,6 +1766,12 @@ dependencies = [ "bytemuck", ] +[[package]] +name = "rle-decode-fast" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3582f63211428f83597b51b2ddb88e2a91a9d52d12831f9d08f5e624e8977422" + [[package]] name = "rust-ini" version = "0.21.3" @@ -1594,6 +1782,12 @@ dependencies = [ "ordered-multimap", ] +[[package]] +name = "rustc-hash" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "357703d41365b4b27c590e3ed91eabb1b663f07c4c084095e60cbed4362dff0d" + [[package]] name = "ryu" version = "1.0.20" @@ -1737,7 +1931,7 @@ checksum = "bf776ba3fa74f83bf4b63c3dcbbf82173db2632ed8452cb2d891d33f459de70f" dependencies = [ "new_debug_unreachable", "parking_lot", - "phf_shared", + "phf_shared 0.11.3", "precomputed-hash", "serde", ] @@ -1748,8 +1942,8 @@ version = "0.5.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c711928715f1fe0fe509c53b43e993a9a557babc2d0a3567d0a3006f1ac931a0" dependencies = [ - "phf_generator", - "phf_shared", + "phf_generator 0.11.3", + "phf_shared 0.11.3", "proc-macro2", "quote", ] @@ -2062,8 +2256,8 @@ version = "0.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "57ffde1dc01240bdf9992e3205668b235e59421fd085e8a317ed98da0178d414" dependencies = [ - "phf", - "phf_codegen", + "phf 0.11.3", + "phf_codegen 0.11.3", "string_cache", "string_cache_codegen", ] diff --git a/Cargo.toml b/Cargo.toml index a25e06b..d63a62b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -18,6 +18,7 @@ encoding = "0.2" fancy-regex = { version = "0.16", optional = true } flate2 = { version = "1.1", optional = true } int-enum = { version = "1.2", optional = true } +jieba-rs = { version = "0.8", optional = true } json = { version = "0.12", optional = true } jpegxl-sys = { package = "msg-tool-jpegxl-sys", version = "0.11", optional = true, features = ["vendored"] } lazy_static = "1.5.0" @@ -47,7 +48,7 @@ xml5ever = { version = "0.35", optional = true } zstd = { version = "0.13", optional = true } [features] -default = ["all-fmt", "image-jpg", "image-jxl", "image-webp", "audio-flac"] +default = ["all-fmt", "image-jpg", "image-jxl", "image-webp", "audio-flac", "jieba"] all-fmt = ["all-script", "all-img", "all-arc", "all-audio"] all-script = ["artemis", "artemis-panmimisoft", "bgi", "cat-system", "circus", "entis-gls", "escude", "ex-hibit", "favorite", "hexen-haus", "kirikiri", "silky", "softpal", "will-plus", "yaneurao", "yaneurao-itufuru"] all-img = ["bgi-img", "cat-system-img", "circus-img", "emote-img", "kirikiri-img"] @@ -89,6 +90,7 @@ image-webp = ["webp"] lossless-audio = ["utils-pcm"] audio-flac = ["libflac-sys", "utils-pcm"] unstable = ["msg_tool_macro/unstable"] +jieba = ["jieba-rs"] # utils feature utils-bit-stream = [] utils-blowfish = ["byteorder"] diff --git a/src/args.rs b/src/args.rs index 61598ec..2f538ae 100644 --- a/src/args.rs +++ b/src/args.rs @@ -460,6 +460,10 @@ pub struct Arg { /// Workers count for encode images in parallel. Default is half of CPU cores. /// Set this to 1 to disable parallel encoding. 0 means same as 1. pub image_workers: usize, + #[cfg(feature = "jieba")] + #[arg(long, global = true)] + /// Path to custom jieba dictionary + pub jieba_dict: Option, #[command(subcommand)] /// Command pub command: Command, @@ -516,6 +520,10 @@ pub struct ImportArgs { #[arg(long, action = ArgAction::SetTrue)] /// If a line break occurs in the middle of some symbols, bring the sentence to next line (for fixed format) pub patched_break_with_sentence: bool, + #[cfg(feature = "jieba")] + #[arg(long, action = ArgAction::SetTrue)] + /// Whether to disable break Chinese words at the end of the line. + pub patched_no_break_chinese_words: bool, #[arg(long)] /// Name table file pub name_csv: Option, diff --git a/src/format/fixed.rs b/src/format/fixed.rs index 4790886..96e5db1 100644 --- a/src/format/fixed.rs +++ b/src/format/fixed.rs @@ -1,9 +1,12 @@ use crate::types::*; +use anyhow::Result; +#[cfg(feature = "jieba")] +use jieba_rs::Jieba; use unicode_segmentation::UnicodeSegmentation; const SPACE_STR_LIST: [&str; 2] = [" ", " "]; const QUOTE_LIST: [(&str, &str); 4] = [("「", "」"), ("『", "』"), ("(", ")"), ("【", "】")]; -const BREAK_SENTENCE_SYMBOLS: [&str; 5] = ["…", ",", "。", "?", "!"]; +const BREAK_SENTENCE_SYMBOLS: [&str; 6] = ["…", ",", "。", "?", "!", "—"]; fn check_is_ascii_alphanumeric(s: &str) -> bool { for c in s.chars() { @@ -45,6 +48,27 @@ fn check_is_end_quote(segs: &[&str], pos: usize) -> bool { true } +#[cfg(feature = "jieba")] +fn check_chinese_word_is_break(segs: &[&str], pos: usize, jieba: &Jieba) -> bool { + let s = segs.join(""); + let mut breaked = jieba + .cut(&s, false) + .iter() + .map(|s| s.graphemes(true).count()) + .collect::>(); + let mut sum = 0; + for i in breaked.iter_mut() { + sum += *i; + *i = sum; + } + breaked.binary_search(&pos).is_err() +} + +#[cfg(not(feature = "jieba"))] +fn check_chinese_word_is_break(_segs: &[&str], _pos: usize, _jieba: &()) -> bool { + false +} + pub struct FixedFormatter { length: usize, keep_original: bool, @@ -54,6 +78,11 @@ pub struct FixedFormatter { insert_fullwidth_space_at_line_start: bool, /// If a line break occurs in the middle of some symbols, bring the sentence to next line break_with_sentence: bool, + #[cfg(feature = "jieba")] + /// Jieba instance for Chinese word segmentation. + jieba: Option, + #[cfg(not(feature = "jieba"))] + jieba: Option<()>, #[allow(unused)] typ: Option, } @@ -65,16 +94,34 @@ impl FixedFormatter { break_words: bool, insert_fullwidth_space_at_line_start: bool, break_with_sentence: bool, + #[cfg(feature = "jieba")] break_chinese_words: bool, + #[cfg(feature = "jieba")] jieba_dict: Option, typ: Option, - ) -> Self { - FixedFormatter { + ) -> Result { + #[cfg(feature = "jieba")] + let jieba = if !break_chinese_words { + let mut jieba = Jieba::new(); + if let Some(dict) = jieba_dict { + let file = std::fs::File::open(dict)?; + let mut reader = std::io::BufReader::new(file); + jieba.load_dict(&mut reader)?; + } + Some(jieba) + } else { + None + }; + Ok(FixedFormatter { length, keep_original, break_words, insert_fullwidth_space_at_line_start, break_with_sentence, + #[cfg(feature = "jieba")] + jieba, + #[cfg(not(feature = "jieba"))] + jieba: None, typ, - } + }) } #[cfg(test)] @@ -85,6 +132,7 @@ impl FixedFormatter { break_words: true, insert_fullwidth_space_at_line_start: false, break_with_sentence: false, + jieba: None, typ: None, } } @@ -113,7 +161,27 @@ impl FixedFormatter { self } + #[cfg(all(feature = "jieba", test))] + fn break_chinese_words(mut self, break_chinese_words: bool) -> Result { + if !break_chinese_words { + let jieba = Jieba::new(); + self.jieba = Some(jieba); + } else { + self.jieba = None; + } + Ok(self) + } + + #[cfg(all(feature = "jieba", test))] + fn add_dict(mut self, dict: &str, freq: Option, tag: Option<&str>) -> Self { + if let Some(ref mut jieba) = self.jieba { + jieba.add_word(&dict, freq, tag); + } + self + } + #[cfg(test)] + #[allow(dead_code)] fn typ(mut self, typ: Option) -> Self { self.typ = typ; self @@ -318,6 +386,81 @@ impl FixedFormatter { main_content.clear(); pre_is_lf = true; } + } else if self + .jieba + .as_ref() + .is_some_and(|s| check_chinese_word_is_break(&vec, i, s)) + && !is_command + && !is_ruby_rt + { + #[cfg(feature = "jieba")] + { + let jieba = self.jieba.as_ref().unwrap(); + let s = vec.join(""); + let mut breaked = jieba + .cut(&s, false) + .iter() + .map(|s| s.graphemes(true).count()) + .collect::>(); + let mut sum = 0; + for i in breaked.iter_mut() { + sum += *i; + *i = sum; + } + let break_pos = match breaked.binary_search(&i) { + Ok(pos) => Some(pos), + Err(pos) => { + if pos == 0 { + None + } else { + Some(pos - 1) + } + } + }; + if let Some(break_pos) = break_pos { + let pos = breaked[break_pos]; + let segs = result.graphemes(true).collect::>(); + let remain_count = i - pos; + let pos = segs.len() - remain_count; + let remaining = segs[pos..].concat().trim_start().to_string(); + result = segs[..pos].concat(); + result.push('\n'); + current_length = 0; + if first_line { + if self.insert_fullwidth_space_at_line_start { + if check_need_fullwidth_space(&main_content) { + need_insert_fullwidth_space = true; + } + } + first_line = false; + } + if need_insert_fullwidth_space { + result.push(' '); + current_length += 1; + } + result.push_str(&remaining); + current_length += remaining.graphemes(true).count(); + main_content.clear(); + pre_is_lf = true; + } else { + result.push('\n'); + current_length = 0; + if first_line { + if self.insert_fullwidth_space_at_line_start { + if check_need_fullwidth_space(&main_content) { + need_insert_fullwidth_space = true; + } + } + first_line = false; + } + if need_insert_fullwidth_space { + result.push(' '); + current_length += 1; + } + main_content.clear(); + pre_is_lf = true; + } + } } else { result.push('\n'); current_length = 0; @@ -408,7 +551,7 @@ impl FixedFormatter { i += 1; } - return result; + result } } @@ -532,4 +675,31 @@ fn test_format() { "%test;[ruby]测[test]试打\n断。" ); } + #[cfg(feature = "jieba")] + { + let jieba_formatter = FixedFormatter::builder(8) + .break_words(false) + .break_chinese_words(false) + .unwrap(); + assert_eq!( + jieba_formatter.format("测试分词,我们中出了一个叛徒。"), + "测试分词,我们中\n出了一个叛徒。" + ); + let jieba_formatter2 = FixedFormatter::builder(8) + .break_words(false) + .break_chinese_words(false) + .unwrap() + .add_dict("中出", Some(114514), None); + assert_eq!( + jieba_formatter2 + .jieba + .as_ref() + .is_some_and(|s| s.has_word("中出")), + true + ); + assert_eq!( + jieba_formatter2.format("测试分词,我们中出了一个叛徒。"), + "测试分词,我们\n中出了一个叛徒。" + ); + } } diff --git a/src/format/mod.rs b/src/format/mod.rs index 70fc7b1..74cd5ac 100644 --- a/src/format/mod.rs +++ b/src/format/mod.rs @@ -2,9 +2,10 @@ mod fixed; use crate::types::*; +use anyhow::Result; /// Formats messages with the given options. -pub fn fmt_message(mes: &mut Vec, opt: FormatOptions, typ: ScriptType) { +pub fn fmt_message(mes: &mut Vec, opt: FormatOptions, typ: ScriptType) -> Result<()> { match opt { FormatOptions::Fixed { length, @@ -12,6 +13,10 @@ pub fn fmt_message(mes: &mut Vec, opt: FormatOptions, typ: ScriptType) break_words, insert_fullwidth_space_at_line_start, break_with_sentence, + #[cfg(feature = "jieba")] + break_chinese_words, + #[cfg(feature = "jieba")] + jieba_dict, } => { let formatter = fixed::FixedFormatter::new( length, @@ -19,12 +24,17 @@ pub fn fmt_message(mes: &mut Vec, opt: FormatOptions, typ: ScriptType) break_words, insert_fullwidth_space_at_line_start, break_with_sentence, + #[cfg(feature = "jieba")] + break_chinese_words, + #[cfg(feature = "jieba")] + jieba_dict, Some(typ), - ); + )?; for message in mes.iter_mut() { message.message = formatter.format(&message.message); } } FormatOptions::None => {} } + Ok(()) } diff --git a/src/main.rs b/src/main.rs index 9927ac5..2b9f3cb 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1373,6 +1373,10 @@ pub fn import_script( insert_fullwidth_space_at_line_start: imp_cfg .patched_insert_fullwidth_space_at_line_start, break_with_sentence: imp_cfg.patched_break_with_sentence, + #[cfg(feature = "jieba")] + break_chinese_words: !imp_cfg.patched_no_break_chinese_words, + #[cfg(feature = "jieba")] + jieba_dict: arg.jieba_dict.clone(), }, types::FormatType::None => types::FormatOptions::None, }, @@ -1384,7 +1388,7 @@ pub fn import_script( } None => {} } - format::fmt_message(&mut mes, fmt, *builder.script_type()); + format::fmt_message(&mut mes, fmt, *builder.script_type())?; if let Err(e) = script_file.import_messages( mes, writer, @@ -1592,6 +1596,10 @@ pub fn import_script( insert_fullwidth_space_at_line_start: imp_cfg .patched_insert_fullwidth_space_at_line_start, break_with_sentence: imp_cfg.patched_break_with_sentence, + #[cfg(feature = "jieba")] + break_chinese_words: !imp_cfg.patched_no_break_chinese_words, + #[cfg(feature = "jieba")] + jieba_dict: arg.jieba_dict.clone(), }, types::FormatType::None => types::FormatOptions::None, }, @@ -1603,7 +1611,7 @@ pub fn import_script( } None => {} } - format::fmt_message(&mut mes, fmt, *builder.script_type()); + format::fmt_message(&mut mes, fmt, *builder.script_type())?; script.import_messages_filename(mes, &patched_f, encoding, repl)?; Ok(types::ScriptResult::Ok) diff --git a/src/scripts/bgi/script.rs b/src/scripts/bgi/script.rs index 86223a4..44efaa5 100644 --- a/src/scripts/bgi/script.rs +++ b/src/scripts/bgi/script.rs @@ -170,6 +170,10 @@ impl Script for BGIScript { break_words: false, insert_fullwidth_space_at_line_start: true, break_with_sentence: true, + #[cfg(feature = "jieba")] + break_chinese_words: true, + #[cfg(feature = "jieba")] + jieba_dict: None, } } } diff --git a/src/scripts/circus/script.rs b/src/scripts/circus/script.rs index 0eeefef..25bd9df 100644 --- a/src/scripts/circus/script.rs +++ b/src/scripts/circus/script.rs @@ -220,6 +220,10 @@ impl Script for CircusMesScript { break_words: false, insert_fullwidth_space_at_line_start: true, break_with_sentence: true, + #[cfg(feature = "jieba")] + break_chinese_words: true, + #[cfg(feature = "jieba")] + jieba_dict: None, } } diff --git a/src/types.rs b/src/types.rs index 33e4d4c..31ac597 100644 --- a/src/types.rs +++ b/src/types.rs @@ -664,6 +664,12 @@ pub enum FormatOptions { insert_fullwidth_space_at_line_start: bool, /// If a line break occurs in the middle of some symbols, bring the sentence to next line break_with_sentence: bool, + #[cfg(feature = "jieba")] + /// Whether to break Chinese words at the end of the line. + break_chinese_words: bool, + #[cfg(feature = "jieba")] + /// Path to custom jieba dictionary + jieba_dict: Option, }, /// Do not wrap line None,