Add remove_dict.py
This commit is contained in:
78
remove_dict.py
Normal file
78
remove_dict.py
Normal file
@@ -0,0 +1,78 @@
|
||||
from csv import DictReader, DictWriter
|
||||
import os
|
||||
from json import load
|
||||
|
||||
|
||||
def process_message(mes: str) -> str:
    """Flatten a multi-line message into one line.

    The text is split on line boundaries, every piece is stripped of its
    leading and trailing whitespace, and the pieces are glued back together
    with no separator.
    """
    trimmed_parts = map(str.strip, mes.splitlines())
    return "".join(trimmed_parts)
|
||||
|
||||
|
||||
def read_dict(file_path: str) -> list:
    """Load a tab-separated dictionary file into a list of row dicts.

    The first line of the file is taken as the header; every following
    record becomes one dict keyed by the header's column names.

    Args:
        file_path: Path to a UTF-8 encoded, tab-delimited file.

    Returns:
        The records in file order (an empty list when there are none).
    """
    # newline="" leaves line-ending handling to the csv module, as its
    # documentation requires; without it, line breaks embedded in quoted
    # fields are rewritten by universal-newline translation.
    with open(file_path, "r", encoding="utf-8", newline="") as f:
        return list(DictReader(f, delimiter="\t"))
|
||||
|
||||
|
||||
def save_dict(data: list, file_path: str):
    """Write row dicts to a tab-separated file with a header line.

    The column order comes from the keys of the first row. When ``data`` is
    empty nothing is written and the target file is not touched.

    Args:
        data: Row dicts to write.
        file_path: Destination path; written as UTF-8.
    """
    if not data:
        return

    with open(file_path, "w", encoding="utf-8", newline="") as out:
        columns = data[0].keys()
        tsv = DictWriter(out, fieldnames=columns, delimiter="\t")
        tsv.writeheader()
        tsv.writerows(data)
|
||||
|
||||
|
||||
def process_dict(dict_path: str, json_path: str):
    """Prune dictionary entries that are used by at most one JSON file.

    Every ``.json`` file below ``json_path`` is loaded and iterated. For each
    item, a dictionary word is counted as used when it equals the item's
    ``name`` value or occurs as a substring of the item's flattened
    ``message`` text. Entries whose word was seen in more than one file are
    kept; the rest are dropped and the surviving rows are written back to
    ``dict_path``.

    Args:
        dict_path: Path to the TSV dictionary file; rows need a ``word``
            column. The file is overwritten with the kept rows.
        json_path: Directory that is walked recursively for ``.json`` files.
    """
    d = read_dict(dict_path)

    # word -> {path of JSON file -> number of hits inside that file}
    count_ref = {entry['word']: {} for entry in d}

    for root, _, files in os.walk(json_path):
        for file in files:
            if not file.endswith(".json"):
                continue
            full_path = os.path.join(root, file)
            with open(full_path, "r", encoding="utf-8") as f:
                mes = load(f)

            for item in mes:
                # Only JSON objects carry 'name'/'message'; skip anything else
                # (e.g. the keys of a top-level object, or bare strings).
                if not isinstance(item, dict):
                    continue

                if 'name' in item:
                    name = item['name']
                    if name in count_ref:
                        hits = count_ref[name]
                        # Key by full path, not bare filename: os.walk
                        # recurses, and same-named files in different
                        # subdirectories must count as distinct files.
                        hits[full_path] = hits.get(full_path, 0) + 1

                raw_message = item.get('message', '')
                if not isinstance(raw_message, str):
                    # A null / non-text message has nothing to search.
                    continue
                mess = process_message(raw_message)

                # Iterate the unique words so a word listed twice in the
                # dictionary is not counted twice for the same message.
                for word, hits in count_ref.items():
                    if word in mess:
                        hits[full_path] = hits.get(full_path, 0) + 1

    result = []
    removed = 0
    for entry in d:
        word = entry['word']
        hits = count_ref[word]
        file_count = len(hits)
        count = sum(hits.values())
        print(f"Word: {word}, Files: {file_count}, Total Count: {count}")
        if file_count > 1:
            result.append(entry)
        else:
            removed += 1

    print(f"Removed {removed} entries with occurrences in 1 or fewer files.")
    # NOTE(review): save_dict() returns without writing when given an empty
    # list, so if every entry was pruned the dictionary file stays unchanged.
    save_dict(result, dict_path)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    from argparse import ArgumentParser

    # Command-line entry point: two positional paths are parsed and handed
    # straight to process_dict().
    cli = ArgumentParser(
        description="Process dictionary and JSON files to count word occurrences."
    )
    positional_args = (
        ("dict_path", "Path to the TSV dictionary file."),
        ("json_path", "Path to the directory containing JSON files."),
    )
    for arg_name, arg_help in positional_args:
        cli.add_argument(arg_name, help=arg_help)

    options = cli.parse_args()
    process_dict(options.dict_path, options.json_path)
|
||||
Reference in New Issue
Block a user