Add remove_dict.py

This commit is contained in:
2025-11-15 18:09:37 +08:00
parent bf88e7f977
commit f34f1e8380

78
remove_dict.py Normal file
View File

@@ -0,0 +1,78 @@
from csv import DictReader, DictWriter
import os
from json import load
def process_message(mes: str) -> str:
    """Collapse a multi-line message into one unbroken string.

    The text is split on line boundaries, every line loses its leading and
    trailing whitespace, and the pieces are glued together with no separator.
    """
    stripped_lines = [line.strip() for line in mes.splitlines()]
    return "".join(stripped_lines)
def read_dict(file_path: str) -> list:
    """Load a tab-separated dictionary file into a list of row dicts.

    The first line of the file is used as the header, and every following
    row becomes one dict keyed by the header names.

    Args:
        file_path: Path of the UTF-8 encoded, tab-delimited file to read.

    Returns:
        The rows in file order; an empty list when the file has no data rows.
    """
    # newline="" passes line endings to the csv module untouched, as its
    # documentation requires. Without it, universal-newline translation
    # rewrites "\r\n" inside quoted fields to "\n", so a value written by
    # save_dict (which already opens with newline="") would not round-trip.
    with open(file_path, "r", encoding="utf-8", newline="") as f:
        return list(DictReader(f, delimiter="\t"))
def save_dict(data: list, file_path: str):
    """Write row dicts to *file_path* as a tab-separated file with a header.

    The column order is taken from the keys of the first row. When *data* is
    empty nothing is written and *file_path* is not opened at all, so a file
    already present at that path keeps its previous content.
    """
    if data:
        with open(file_path, "w", encoding="utf-8", newline="") as out:
            columns = data[0].keys()
            tsv = DictWriter(out, fieldnames=columns, delimiter="\t")
            tsv.writeheader()
            tsv.writerows(data)
def process_dict(dict_path: str, json_path: str):
    """Drop dictionary entries that show up in at most one JSON file.

    Every ``*.json`` file found under *json_path* (searched recursively) is
    loaded and iterated as a sequence of records. A record counts towards a
    dictionary word when its ``name`` field equals the word, and again when
    the word is a substring of its ``message`` field after that message has
    been collapsed by ``process_message``. Per-word statistics are printed,
    and the entries seen in more than one file are written back to
    *dict_path*.

    Args:
        dict_path: Tab-separated dictionary file with a ``word`` column; it
            is overwritten with the surviving entries.
        json_path: Directory tree containing the JSON files to scan.
    """
    entries = read_dict(dict_path)

    # word -> {path of JSON file -> number of matching records in that file}
    count_ref = {entry['word']: {} for entry in entries}
    # The dict keys hold each distinct word once, so a word that is listed
    # twice in the dictionary is not counted twice for the same message.
    words = list(count_ref)

    for root, _, files in os.walk(json_path):
        for file in files:
            if not file.endswith(".json"):
                continue
            # Key the counts by full path: the walk is recursive, and two
            # files sharing a base name in different directories are still
            # two distinct files.
            full_path = os.path.join(root, file)
            with open(full_path, "r", encoding="utf-8") as f:
                records = load(f)

            for item in records:
                # Only JSON objects can carry 'name' / 'message' fields.
                if not isinstance(item, dict):
                    continue

                name = item.get('name')
                if isinstance(name, str) and name in count_ref:
                    hits = count_ref[name]
                    hits[full_path] = hits.get(full_path, 0) + 1

                message = item.get('message')
                if not isinstance(message, str):
                    # A missing, null or non-text message has nothing to scan.
                    continue
                text = process_message(message)
                for word in words:
                    if word in text:
                        hits = count_ref[word]
                        hits[full_path] = hits.get(full_path, 0) + 1

    result = []
    removed = 0
    for entry in entries:
        word = entry['word']
        per_file = count_ref[word]
        file_count = len(per_file)
        count = sum(per_file.values())
        print(f"Word: {word}, Files: {file_count}, Total Count: {count}")
        if file_count > 1:
            result.append(entry)
        else:
            removed += 1
    print(f"Removed {removed} entries with occurrences in 1 or fewer files.")

    if entries and not result:
        # save_dict returns without writing when given an empty list, so say
        # so instead of letting the "Removed" line imply the file changed.
        print("No entries remain; dictionary file left unchanged.")
    save_dict(result, dict_path)
if __name__ == "__main__":
    from argparse import ArgumentParser

    # Command-line entry point: two required positional paths are parsed and
    # handed straight to process_dict.
    cli = ArgumentParser(
        description="Process dictionary and JSON files to count word occurrences."
    )
    for arg_name, arg_help in (
        ("dict_path", "Path to the TSV dictionary file."),
        ("json_path", "Path to the directory containing JSON files."),
    ):
        cli.add_argument(arg_name, help=arg_help)
    options = cli.parse_args()
    process_dict(options.dict_path, options.json_path)