"""Prune a TSV dictionary to the words that occur in more than one JSON file.

The dictionary is a tab-separated file with a ``word`` column.  Every
``*.json`` file under a directory tree is scanned; each file is expected to
hold a list of objects with optional ``name`` and ``message`` keys.  Entries
whose word shows up in at most one file are dropped and the dictionary file
is rewritten in place.
"""
import os
from collections import Counter
from csv import DictReader, DictWriter
from json import load


def process_message(mes: str) -> str:
    """Return *mes* with each line stripped and the lines joined with no separator."""
    return "".join(line.strip() for line in mes.splitlines())


def read_dict(file_path: str) -> list:
    """Read the tab-separated file at *file_path* into a list of row dicts.

    The first line of the file supplies the column names.
    """
    # newline="" lets the csv module do its own line-ending handling, as the
    # csv documentation recommends for files passed to a reader.
    with open(file_path, "r", encoding="utf-8", newline="") as f:
        return list(DictReader(f, delimiter="\t"))


def save_dict(data: list, file_path: str, fieldnames=None):
    """Write *data* (a list of row dicts) to *file_path* as tab-separated text.

    Args:
        data: Rows to write.
        file_path: Destination file; it is overwritten.
        fieldnames: Optional column names for the header.  When omitted they
            are taken from the keys of the first row, and an empty *data*
            leaves *file_path* untouched (there is no header to infer).
            When given, a header-only file is written even for empty *data*.
    """
    if fieldnames is None:
        if not data:
            return
        fieldnames = list(data[0].keys())
    with open(file_path, "w", encoding="utf-8", newline="") as f:
        writer = DictWriter(f, fieldnames=fieldnames, delimiter="\t")
        writer.writeheader()
        writer.writerows(data)


def _count_file(full_path: str, count_ref: dict) -> None:
    """Add the occurrences found in one JSON file to *count_ref*.

    *count_ref* maps each dictionary word to a ``Counter`` keyed by file path.
    A word is counted once for every item whose ``name`` equals it and once
    for every item whose flattened ``message`` contains it.
    """
    with open(full_path, "r", encoding="utf-8") as f:
        items = load(f)
    for item in items:
        if "name" in item:
            name = item["name"]
            if name in count_ref:
                count_ref[name][full_path] += 1
        # A JSON null message is treated like a missing one instead of
        # crashing inside process_message.
        text = process_message(item.get("message") or "")
        # Iterate unique words so that duplicate dictionary rows do not
        # inflate the totals.
        for word, per_file in count_ref.items():
            if word in text:
                per_file[full_path] += 1


def process_dict(dict_path: str, json_path: str):
    """Drop dictionary entries whose word occurs in at most one JSON file.

    Args:
        dict_path: Path to the TSV dictionary (must have a ``word`` column).
            The file is rewritten in place with the surviving entries.
        json_path: Directory that is walked recursively for ``*.json`` files.

    A summary line is printed for every entry, followed by the number of
    removed entries.
    """
    d = read_dict(dict_path)
    count_ref = {entry["word"]: Counter() for entry in d}

    for root, _, files in os.walk(json_path):
        for file in files:
            if file.endswith(".json"):
                # Key the counts by full path: the walk is recursive, so bare
                # file names may repeat across sub-directories and would
                # otherwise be mistaken for a single file.
                _count_file(os.path.join(root, file), count_ref)

    result = []
    for entry in d:
        word = entry["word"]
        per_file = count_ref[word]
        file_count = len(per_file)
        count = sum(per_file.values())
        print(f"Word: {word}, Files: {file_count}, Total Count: {count}")
        if file_count > 1:
            result.append(entry)
    removed = len(d) - len(result)
    print(f"Removed {removed} entries with occurrences in 1 or fewer files.")

    if d:
        # When every entry was pruned there is no surviving row to infer the
        # header from, so fall back to the first original row; this keeps the
        # file in sync with the reported removals.
        header_source = result[0] if result else d[0]
        save_dict(result, dict_path, fieldnames=list(header_source))


if __name__ == "__main__":
    from argparse import ArgumentParser

    parser = ArgumentParser(description="Process dictionary and JSON files to count word occurrences.")
    parser.add_argument("dict_path", help="Path to the TSV dictionary file.")
    parser.add_argument("json_path", help="Path to the directory containing JSON files.")
    args = parser.parse_args()
    process_dict(args.dict_path, args.json_path)