mirror of
https://github.com/lifegpc/msg-tool.git
synced 2026-06-16 01:54:19 +08:00
Add do not break chinese word support for fixed formatter
This commit is contained in:
214
Cargo.lock
generated
214
Cargo.lock
generated
@@ -14,6 +14,24 @@ version = "2.0.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa"
|
||||
|
||||
[[package]]
|
||||
name = "adler32"
|
||||
version = "1.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "aae1277d39aeec15cb388266ecc24b11c80469deae6067e17a1a7aa9e5c1f234"
|
||||
|
||||
[[package]]
|
||||
name = "ahash"
|
||||
version = "0.8.12"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"once_cell",
|
||||
"version_check",
|
||||
"zerocopy",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "aho-corasick"
|
||||
version = "1.1.3"
|
||||
@@ -23,6 +41,12 @@ dependencies = [
|
||||
"memchr",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "allocator-api2"
|
||||
version = "0.2.21"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923"
|
||||
|
||||
[[package]]
|
||||
name = "anstream"
|
||||
version = "0.6.20"
|
||||
@@ -187,6 +211,15 @@ dependencies = [
|
||||
"shlex",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "cedarwood"
|
||||
version = "0.4.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6d910bedd62c24733263d0bed247460853c9d22e8956bd4cd964302095e04e90"
|
||||
dependencies = [
|
||||
"smallvec",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "cfg-if"
|
||||
version = "1.0.3"
|
||||
@@ -326,6 +359,15 @@ dependencies = [
|
||||
"tiny-keccak",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "core2"
|
||||
version = "0.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b49ba7ef1ad6107f8824dbe97de947cbaac53c44e7f9756a1fba0d37c1eec505"
|
||||
dependencies = [
|
||||
"memchr",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "cpufeatures"
|
||||
version = "0.2.17"
|
||||
@@ -426,6 +468,12 @@ dependencies = [
|
||||
"windows-sys 0.61.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "dary_heap"
|
||||
version = "0.3.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "04d2cd9c18b9f454ed67da600630b021a8a80bf33f8c95896ab33aaf1c26b728"
|
||||
|
||||
[[package]]
|
||||
name = "dataview"
|
||||
version = "1.0.1"
|
||||
@@ -620,6 +668,12 @@ dependencies = [
|
||||
"regex-syntax",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "fastrand"
|
||||
version = "2.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be"
|
||||
|
||||
[[package]]
|
||||
name = "fdeflate"
|
||||
version = "0.3.7"
|
||||
@@ -754,6 +808,10 @@ name = "hashbrown"
|
||||
version = "0.14.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1"
|
||||
dependencies = [
|
||||
"ahash",
|
||||
"allocator-api2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "hashbrown"
|
||||
@@ -922,6 +980,43 @@ dependencies = [
|
||||
"winapi-util",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "include-flate"
|
||||
version = "0.3.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e01b7cb6ca682a621e7cda1c358c9724b53a7b4409be9be1dd443b7f3a26f998"
|
||||
dependencies = [
|
||||
"include-flate-codegen",
|
||||
"include-flate-compress",
|
||||
"libflate",
|
||||
"zstd",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "include-flate-codegen"
|
||||
version = "0.3.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4f49bf5274aebe468d6e6eba14a977eaf1efa481dc173f361020de70c1c48050"
|
||||
dependencies = [
|
||||
"include-flate-compress",
|
||||
"libflate",
|
||||
"proc-macro-error",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.106",
|
||||
"zstd",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "include-flate-compress"
|
||||
version = "0.3.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "eae6a40e716bcd5931f5dbb79cd921512a4f647e2e9413fded3171fca3824dbc"
|
||||
dependencies = [
|
||||
"libflate",
|
||||
"zstd",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "indexmap"
|
||||
version = "1.9.3"
|
||||
@@ -975,6 +1070,29 @@ version = "1.0.15"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c"
|
||||
|
||||
[[package]]
|
||||
name = "jieba-macros"
|
||||
version = "0.8.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "348294e44ee7e3c42685da656490f8febc7359632544019621588902216da95c"
|
||||
dependencies = [
|
||||
"phf_codegen 0.13.1",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "jieba-rs"
|
||||
version = "0.8.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "766bd7012aa5ba49411ebdf4e93bddd59b182d2918e085d58dec5bb9b54b7105"
|
||||
dependencies = [
|
||||
"cedarwood",
|
||||
"include-flate",
|
||||
"jieba-macros",
|
||||
"phf 0.13.1",
|
||||
"regex",
|
||||
"rustc-hash",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "jobserver"
|
||||
version = "0.1.34"
|
||||
@@ -1013,6 +1131,30 @@ dependencies = [
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "libflate"
|
||||
version = "2.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "45d9dfdc14ea4ef0900c1cddbc8dcd553fbaacd8a4a282cf4018ae9dd04fb21e"
|
||||
dependencies = [
|
||||
"adler32",
|
||||
"core2",
|
||||
"crc32fast",
|
||||
"dary_heap",
|
||||
"libflate_lz77",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "libflate_lz77"
|
||||
version = "2.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e6e0d73b369f386f1c44abd9c570d5318f55ccde816ff4b562fa452e5182863d"
|
||||
dependencies = [
|
||||
"core2",
|
||||
"hashbrown 0.14.5",
|
||||
"rle-decode-fast",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "libtlg-rs"
|
||||
version = "0.2.2"
|
||||
@@ -1169,6 +1311,7 @@ dependencies = [
|
||||
"fancy-regex",
|
||||
"flate2",
|
||||
"int-enum",
|
||||
"jieba-rs",
|
||||
"json",
|
||||
"lazy_static",
|
||||
"libflac-sys",
|
||||
@@ -1195,7 +1338,7 @@ dependencies = [
|
||||
"url",
|
||||
"utf16string",
|
||||
"webp",
|
||||
"windows-sys 0.59.0",
|
||||
"windows-sys 0.61.0",
|
||||
"xml5ever",
|
||||
"zstd",
|
||||
]
|
||||
@@ -1358,7 +1501,17 @@ version = "0.11.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1fd6780a80ae0c52cc120a26a1a42c1ae51b247a253e4e06113d23d2c2edd078"
|
||||
dependencies = [
|
||||
"phf_shared",
|
||||
"phf_shared 0.11.3",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "phf"
|
||||
version = "0.13.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c1562dc717473dbaa4c1f85a36410e03c047b2e7df7f45ee938fbef64ae7fadf"
|
||||
dependencies = [
|
||||
"phf_shared 0.13.1",
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -1367,8 +1520,18 @@ version = "0.11.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "aef8048c789fa5e851558d709946d6d79a8ff88c0440c587967f8e94bfb1216a"
|
||||
dependencies = [
|
||||
"phf_generator",
|
||||
"phf_shared",
|
||||
"phf_generator 0.11.3",
|
||||
"phf_shared 0.11.3",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "phf_codegen"
|
||||
version = "0.13.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "49aa7f9d80421bca176ca8dbfebe668cc7a2684708594ec9f3c0db0805d5d6e1"
|
||||
dependencies = [
|
||||
"phf_generator 0.13.1",
|
||||
"phf_shared 0.13.1",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -1377,10 +1540,20 @@ version = "0.11.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3c80231409c20246a13fddb31776fb942c38553c51e871f8cbd687a4cfb5843d"
|
||||
dependencies = [
|
||||
"phf_shared",
|
||||
"phf_shared 0.11.3",
|
||||
"rand 0.8.5",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "phf_generator"
|
||||
version = "0.13.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "135ace3a761e564ec88c03a77317a7c6b80bb7f7135ef2544dbe054243b89737"
|
||||
dependencies = [
|
||||
"fastrand",
|
||||
"phf_shared 0.13.1",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "phf_shared"
|
||||
version = "0.11.3"
|
||||
@@ -1390,6 +1563,15 @@ dependencies = [
|
||||
"siphasher",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "phf_shared"
|
||||
version = "0.13.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e57fef6bc5981e38c2ce2d63bfa546861309f875b8a75f092d1d54ae2d64f266"
|
||||
dependencies = [
|
||||
"siphasher",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pkg-config"
|
||||
version = "0.3.32"
|
||||
@@ -1584,6 +1766,12 @@ dependencies = [
|
||||
"bytemuck",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rle-decode-fast"
|
||||
version = "1.0.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3582f63211428f83597b51b2ddb88e2a91a9d52d12831f9d08f5e624e8977422"
|
||||
|
||||
[[package]]
|
||||
name = "rust-ini"
|
||||
version = "0.21.3"
|
||||
@@ -1594,6 +1782,12 @@ dependencies = [
|
||||
"ordered-multimap",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rustc-hash"
|
||||
version = "2.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "357703d41365b4b27c590e3ed91eabb1b663f07c4c084095e60cbed4362dff0d"
|
||||
|
||||
[[package]]
|
||||
name = "ryu"
|
||||
version = "1.0.20"
|
||||
@@ -1737,7 +1931,7 @@ checksum = "bf776ba3fa74f83bf4b63c3dcbbf82173db2632ed8452cb2d891d33f459de70f"
|
||||
dependencies = [
|
||||
"new_debug_unreachable",
|
||||
"parking_lot",
|
||||
"phf_shared",
|
||||
"phf_shared 0.11.3",
|
||||
"precomputed-hash",
|
||||
"serde",
|
||||
]
|
||||
@@ -1748,8 +1942,8 @@ version = "0.5.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c711928715f1fe0fe509c53b43e993a9a557babc2d0a3567d0a3006f1ac931a0"
|
||||
dependencies = [
|
||||
"phf_generator",
|
||||
"phf_shared",
|
||||
"phf_generator 0.11.3",
|
||||
"phf_shared 0.11.3",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
]
|
||||
@@ -2062,8 +2256,8 @@ version = "0.1.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "57ffde1dc01240bdf9992e3205668b235e59421fd085e8a317ed98da0178d414"
|
||||
dependencies = [
|
||||
"phf",
|
||||
"phf_codegen",
|
||||
"phf 0.11.3",
|
||||
"phf_codegen 0.11.3",
|
||||
"string_cache",
|
||||
"string_cache_codegen",
|
||||
]
|
||||
|
||||
@@ -18,6 +18,7 @@ encoding = "0.2"
|
||||
fancy-regex = { version = "0.16", optional = true }
|
||||
flate2 = { version = "1.1", optional = true }
|
||||
int-enum = { version = "1.2", optional = true }
|
||||
jieba-rs = { version = "0.8", optional = true }
|
||||
json = { version = "0.12", optional = true }
|
||||
jpegxl-sys = { package = "msg-tool-jpegxl-sys", version = "0.11", optional = true, features = ["vendored"] }
|
||||
lazy_static = "1.5.0"
|
||||
@@ -47,7 +48,7 @@ xml5ever = { version = "0.35", optional = true }
|
||||
zstd = { version = "0.13", optional = true }
|
||||
|
||||
[features]
|
||||
default = ["all-fmt", "image-jpg", "image-jxl", "image-webp", "audio-flac"]
|
||||
default = ["all-fmt", "image-jpg", "image-jxl", "image-webp", "audio-flac", "jieba"]
|
||||
all-fmt = ["all-script", "all-img", "all-arc", "all-audio"]
|
||||
all-script = ["artemis", "artemis-panmimisoft", "bgi", "cat-system", "circus", "entis-gls", "escude", "ex-hibit", "favorite", "hexen-haus", "kirikiri", "silky", "softpal", "will-plus", "yaneurao", "yaneurao-itufuru"]
|
||||
all-img = ["bgi-img", "cat-system-img", "circus-img", "emote-img", "kirikiri-img"]
|
||||
@@ -89,6 +90,7 @@ image-webp = ["webp"]
|
||||
lossless-audio = ["utils-pcm"]
|
||||
audio-flac = ["libflac-sys", "utils-pcm"]
|
||||
unstable = ["msg_tool_macro/unstable"]
|
||||
jieba = ["jieba-rs"]
|
||||
# utils feature
|
||||
utils-bit-stream = []
|
||||
utils-blowfish = ["byteorder"]
|
||||
|
||||
@@ -460,6 +460,10 @@ pub struct Arg {
|
||||
/// Workers count for encode images in parallel. Default is half of CPU cores.
|
||||
/// Set this to 1 to disable parallel encoding. 0 means same as 1.
|
||||
pub image_workers: usize,
|
||||
#[cfg(feature = "jieba")]
|
||||
#[arg(long, global = true)]
|
||||
/// Path to custom jieba dictionary
|
||||
pub jieba_dict: Option<String>,
|
||||
#[command(subcommand)]
|
||||
/// Command
|
||||
pub command: Command,
|
||||
@@ -516,6 +520,10 @@ pub struct ImportArgs {
|
||||
#[arg(long, action = ArgAction::SetTrue)]
|
||||
/// If a line break occurs in the middle of some symbols, bring the sentence to next line (for fixed format)
|
||||
pub patched_break_with_sentence: bool,
|
||||
#[cfg(feature = "jieba")]
|
||||
#[arg(long, action = ArgAction::SetTrue)]
|
||||
/// Whether to disable break Chinese words at the end of the line.
|
||||
pub patched_no_break_chinese_words: bool,
|
||||
#[arg(long)]
|
||||
/// Name table file
|
||||
pub name_csv: Option<String>,
|
||||
|
||||
@@ -1,9 +1,12 @@
|
||||
use crate::types::*;
|
||||
use anyhow::Result;
|
||||
#[cfg(feature = "jieba")]
|
||||
use jieba_rs::Jieba;
|
||||
use unicode_segmentation::UnicodeSegmentation;
|
||||
|
||||
const SPACE_STR_LIST: [&str; 2] = [" ", " "];
|
||||
const QUOTE_LIST: [(&str, &str); 4] = [("「", "」"), ("『", "』"), ("(", ")"), ("【", "】")];
|
||||
const BREAK_SENTENCE_SYMBOLS: [&str; 5] = ["…", ",", "。", "?", "!"];
|
||||
const BREAK_SENTENCE_SYMBOLS: [&str; 6] = ["…", ",", "。", "?", "!", "—"];
|
||||
|
||||
fn check_is_ascii_alphanumeric(s: &str) -> bool {
|
||||
for c in s.chars() {
|
||||
@@ -45,6 +48,27 @@ fn check_is_end_quote(segs: &[&str], pos: usize) -> bool {
|
||||
true
|
||||
}
|
||||
|
||||
#[cfg(feature = "jieba")]
|
||||
fn check_chinese_word_is_break(segs: &[&str], pos: usize, jieba: &Jieba) -> bool {
|
||||
let s = segs.join("");
|
||||
let mut breaked = jieba
|
||||
.cut(&s, false)
|
||||
.iter()
|
||||
.map(|s| s.graphemes(true).count())
|
||||
.collect::<Vec<_>>();
|
||||
let mut sum = 0;
|
||||
for i in breaked.iter_mut() {
|
||||
sum += *i;
|
||||
*i = sum;
|
||||
}
|
||||
breaked.binary_search(&pos).is_err()
|
||||
}
|
||||
|
||||
#[cfg(not(feature = "jieba"))]
|
||||
fn check_chinese_word_is_break(_segs: &[&str], _pos: usize, _jieba: &()) -> bool {
|
||||
false
|
||||
}
|
||||
|
||||
pub struct FixedFormatter {
|
||||
length: usize,
|
||||
keep_original: bool,
|
||||
@@ -54,6 +78,11 @@ pub struct FixedFormatter {
|
||||
insert_fullwidth_space_at_line_start: bool,
|
||||
/// If a line break occurs in the middle of some symbols, bring the sentence to next line
|
||||
break_with_sentence: bool,
|
||||
#[cfg(feature = "jieba")]
|
||||
/// Jieba instance for Chinese word segmentation.
|
||||
jieba: Option<Jieba>,
|
||||
#[cfg(not(feature = "jieba"))]
|
||||
jieba: Option<()>,
|
||||
#[allow(unused)]
|
||||
typ: Option<ScriptType>,
|
||||
}
|
||||
@@ -65,16 +94,34 @@ impl FixedFormatter {
|
||||
break_words: bool,
|
||||
insert_fullwidth_space_at_line_start: bool,
|
||||
break_with_sentence: bool,
|
||||
#[cfg(feature = "jieba")] break_chinese_words: bool,
|
||||
#[cfg(feature = "jieba")] jieba_dict: Option<String>,
|
||||
typ: Option<ScriptType>,
|
||||
) -> Self {
|
||||
FixedFormatter {
|
||||
) -> Result<Self> {
|
||||
#[cfg(feature = "jieba")]
|
||||
let jieba = if !break_chinese_words {
|
||||
let mut jieba = Jieba::new();
|
||||
if let Some(dict) = jieba_dict {
|
||||
let file = std::fs::File::open(dict)?;
|
||||
let mut reader = std::io::BufReader::new(file);
|
||||
jieba.load_dict(&mut reader)?;
|
||||
}
|
||||
Some(jieba)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
Ok(FixedFormatter {
|
||||
length,
|
||||
keep_original,
|
||||
break_words,
|
||||
insert_fullwidth_space_at_line_start,
|
||||
break_with_sentence,
|
||||
#[cfg(feature = "jieba")]
|
||||
jieba,
|
||||
#[cfg(not(feature = "jieba"))]
|
||||
jieba: None,
|
||||
typ,
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
@@ -85,6 +132,7 @@ impl FixedFormatter {
|
||||
break_words: true,
|
||||
insert_fullwidth_space_at_line_start: false,
|
||||
break_with_sentence: false,
|
||||
jieba: None,
|
||||
typ: None,
|
||||
}
|
||||
}
|
||||
@@ -113,7 +161,27 @@ impl FixedFormatter {
|
||||
self
|
||||
}
|
||||
|
||||
#[cfg(all(feature = "jieba", test))]
|
||||
fn break_chinese_words(mut self, break_chinese_words: bool) -> Result<Self> {
|
||||
if !break_chinese_words {
|
||||
let jieba = Jieba::new();
|
||||
self.jieba = Some(jieba);
|
||||
} else {
|
||||
self.jieba = None;
|
||||
}
|
||||
Ok(self)
|
||||
}
|
||||
|
||||
#[cfg(all(feature = "jieba", test))]
|
||||
fn add_dict(mut self, dict: &str, freq: Option<usize>, tag: Option<&str>) -> Self {
|
||||
if let Some(ref mut jieba) = self.jieba {
|
||||
jieba.add_word(&dict, freq, tag);
|
||||
}
|
||||
self
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
#[allow(dead_code)]
|
||||
fn typ(mut self, typ: Option<ScriptType>) -> Self {
|
||||
self.typ = typ;
|
||||
self
|
||||
@@ -318,6 +386,81 @@ impl FixedFormatter {
|
||||
main_content.clear();
|
||||
pre_is_lf = true;
|
||||
}
|
||||
} else if self
|
||||
.jieba
|
||||
.as_ref()
|
||||
.is_some_and(|s| check_chinese_word_is_break(&vec, i, s))
|
||||
&& !is_command
|
||||
&& !is_ruby_rt
|
||||
{
|
||||
#[cfg(feature = "jieba")]
|
||||
{
|
||||
let jieba = self.jieba.as_ref().unwrap();
|
||||
let s = vec.join("");
|
||||
let mut breaked = jieba
|
||||
.cut(&s, false)
|
||||
.iter()
|
||||
.map(|s| s.graphemes(true).count())
|
||||
.collect::<Vec<_>>();
|
||||
let mut sum = 0;
|
||||
for i in breaked.iter_mut() {
|
||||
sum += *i;
|
||||
*i = sum;
|
||||
}
|
||||
let break_pos = match breaked.binary_search(&i) {
|
||||
Ok(pos) => Some(pos),
|
||||
Err(pos) => {
|
||||
if pos == 0 {
|
||||
None
|
||||
} else {
|
||||
Some(pos - 1)
|
||||
}
|
||||
}
|
||||
};
|
||||
if let Some(break_pos) = break_pos {
|
||||
let pos = breaked[break_pos];
|
||||
let segs = result.graphemes(true).collect::<Vec<_>>();
|
||||
let remain_count = i - pos;
|
||||
let pos = segs.len() - remain_count;
|
||||
let remaining = segs[pos..].concat().trim_start().to_string();
|
||||
result = segs[..pos].concat();
|
||||
result.push('\n');
|
||||
current_length = 0;
|
||||
if first_line {
|
||||
if self.insert_fullwidth_space_at_line_start {
|
||||
if check_need_fullwidth_space(&main_content) {
|
||||
need_insert_fullwidth_space = true;
|
||||
}
|
||||
}
|
||||
first_line = false;
|
||||
}
|
||||
if need_insert_fullwidth_space {
|
||||
result.push(' ');
|
||||
current_length += 1;
|
||||
}
|
||||
result.push_str(&remaining);
|
||||
current_length += remaining.graphemes(true).count();
|
||||
main_content.clear();
|
||||
pre_is_lf = true;
|
||||
} else {
|
||||
result.push('\n');
|
||||
current_length = 0;
|
||||
if first_line {
|
||||
if self.insert_fullwidth_space_at_line_start {
|
||||
if check_need_fullwidth_space(&main_content) {
|
||||
need_insert_fullwidth_space = true;
|
||||
}
|
||||
}
|
||||
first_line = false;
|
||||
}
|
||||
if need_insert_fullwidth_space {
|
||||
result.push(' ');
|
||||
current_length += 1;
|
||||
}
|
||||
main_content.clear();
|
||||
pre_is_lf = true;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
result.push('\n');
|
||||
current_length = 0;
|
||||
@@ -408,7 +551,7 @@ impl FixedFormatter {
|
||||
i += 1;
|
||||
}
|
||||
|
||||
return result;
|
||||
result
|
||||
}
|
||||
}
|
||||
|
||||
@@ -532,4 +675,31 @@ fn test_format() {
|
||||
"%test;[ruby]测[test]试打\n断。"
|
||||
);
|
||||
}
|
||||
#[cfg(feature = "jieba")]
|
||||
{
|
||||
let jieba_formatter = FixedFormatter::builder(8)
|
||||
.break_words(false)
|
||||
.break_chinese_words(false)
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
jieba_formatter.format("测试分词,我们中出了一个叛徒。"),
|
||||
"测试分词,我们中\n出了一个叛徒。"
|
||||
);
|
||||
let jieba_formatter2 = FixedFormatter::builder(8)
|
||||
.break_words(false)
|
||||
.break_chinese_words(false)
|
||||
.unwrap()
|
||||
.add_dict("中出", Some(114514), None);
|
||||
assert_eq!(
|
||||
jieba_formatter2
|
||||
.jieba
|
||||
.as_ref()
|
||||
.is_some_and(|s| s.has_word("中出")),
|
||||
true
|
||||
);
|
||||
assert_eq!(
|
||||
jieba_formatter2.format("测试分词,我们中出了一个叛徒。"),
|
||||
"测试分词,我们\n中出了一个叛徒。"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2,9 +2,10 @@
|
||||
mod fixed;
|
||||
|
||||
use crate::types::*;
|
||||
use anyhow::Result;
|
||||
|
||||
/// Formats messages with the given options.
|
||||
pub fn fmt_message(mes: &mut Vec<Message>, opt: FormatOptions, typ: ScriptType) {
|
||||
pub fn fmt_message(mes: &mut Vec<Message>, opt: FormatOptions, typ: ScriptType) -> Result<()> {
|
||||
match opt {
|
||||
FormatOptions::Fixed {
|
||||
length,
|
||||
@@ -12,6 +13,10 @@ pub fn fmt_message(mes: &mut Vec<Message>, opt: FormatOptions, typ: ScriptType)
|
||||
break_words,
|
||||
insert_fullwidth_space_at_line_start,
|
||||
break_with_sentence,
|
||||
#[cfg(feature = "jieba")]
|
||||
break_chinese_words,
|
||||
#[cfg(feature = "jieba")]
|
||||
jieba_dict,
|
||||
} => {
|
||||
let formatter = fixed::FixedFormatter::new(
|
||||
length,
|
||||
@@ -19,12 +24,17 @@ pub fn fmt_message(mes: &mut Vec<Message>, opt: FormatOptions, typ: ScriptType)
|
||||
break_words,
|
||||
insert_fullwidth_space_at_line_start,
|
||||
break_with_sentence,
|
||||
#[cfg(feature = "jieba")]
|
||||
break_chinese_words,
|
||||
#[cfg(feature = "jieba")]
|
||||
jieba_dict,
|
||||
Some(typ),
|
||||
);
|
||||
)?;
|
||||
for message in mes.iter_mut() {
|
||||
message.message = formatter.format(&message.message);
|
||||
}
|
||||
}
|
||||
FormatOptions::None => {}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
12
src/main.rs
12
src/main.rs
@@ -1373,6 +1373,10 @@ pub fn import_script(
|
||||
insert_fullwidth_space_at_line_start: imp_cfg
|
||||
.patched_insert_fullwidth_space_at_line_start,
|
||||
break_with_sentence: imp_cfg.patched_break_with_sentence,
|
||||
#[cfg(feature = "jieba")]
|
||||
break_chinese_words: !imp_cfg.patched_no_break_chinese_words,
|
||||
#[cfg(feature = "jieba")]
|
||||
jieba_dict: arg.jieba_dict.clone(),
|
||||
},
|
||||
types::FormatType::None => types::FormatOptions::None,
|
||||
},
|
||||
@@ -1384,7 +1388,7 @@ pub fn import_script(
|
||||
}
|
||||
None => {}
|
||||
}
|
||||
format::fmt_message(&mut mes, fmt, *builder.script_type());
|
||||
format::fmt_message(&mut mes, fmt, *builder.script_type())?;
|
||||
if let Err(e) = script_file.import_messages(
|
||||
mes,
|
||||
writer,
|
||||
@@ -1592,6 +1596,10 @@ pub fn import_script(
|
||||
insert_fullwidth_space_at_line_start: imp_cfg
|
||||
.patched_insert_fullwidth_space_at_line_start,
|
||||
break_with_sentence: imp_cfg.patched_break_with_sentence,
|
||||
#[cfg(feature = "jieba")]
|
||||
break_chinese_words: !imp_cfg.patched_no_break_chinese_words,
|
||||
#[cfg(feature = "jieba")]
|
||||
jieba_dict: arg.jieba_dict.clone(),
|
||||
},
|
||||
types::FormatType::None => types::FormatOptions::None,
|
||||
},
|
||||
@@ -1603,7 +1611,7 @@ pub fn import_script(
|
||||
}
|
||||
None => {}
|
||||
}
|
||||
format::fmt_message(&mut mes, fmt, *builder.script_type());
|
||||
format::fmt_message(&mut mes, fmt, *builder.script_type())?;
|
||||
|
||||
script.import_messages_filename(mes, &patched_f, encoding, repl)?;
|
||||
Ok(types::ScriptResult::Ok)
|
||||
|
||||
@@ -170,6 +170,10 @@ impl Script for BGIScript {
|
||||
break_words: false,
|
||||
insert_fullwidth_space_at_line_start: true,
|
||||
break_with_sentence: true,
|
||||
#[cfg(feature = "jieba")]
|
||||
break_chinese_words: true,
|
||||
#[cfg(feature = "jieba")]
|
||||
jieba_dict: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -220,6 +220,10 @@ impl Script for CircusMesScript {
|
||||
break_words: false,
|
||||
insert_fullwidth_space_at_line_start: true,
|
||||
break_with_sentence: true,
|
||||
#[cfg(feature = "jieba")]
|
||||
break_chinese_words: true,
|
||||
#[cfg(feature = "jieba")]
|
||||
jieba_dict: None,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -664,6 +664,12 @@ pub enum FormatOptions {
|
||||
insert_fullwidth_space_at_line_start: bool,
|
||||
/// If a line break occurs in the middle of some symbols, bring the sentence to next line
|
||||
break_with_sentence: bool,
|
||||
#[cfg(feature = "jieba")]
|
||||
/// Whether to break Chinese words at the end of the line.
|
||||
break_chinese_words: bool,
|
||||
#[cfg(feature = "jieba")]
|
||||
/// Path to custom jieba dictionary
|
||||
jieba_dict: Option<String>,
|
||||
},
|
||||
/// Do not wrap line
|
||||
None,
|
||||
|
||||
Reference in New Issue
Block a user