from collections import Counter
from csv import DictReader, DictWriter
import os
from json import load


def process_message(mes: str) -> str:
    """Collapse a multi-line message into a single string.

    Every line is stripped of surrounding whitespace and the pieces are
    concatenated with no separator, so a word that was broken across lines
    becomes contiguous again.
    """
    return "".join(line.strip() for line in mes.splitlines())


def read_dict(file_path: str) -> list:
    """Read a tab-separated file with a header row.

    Returns one dict per data row, keyed by the header names.
    """
    # newline="" lets the csv module do its own line-ending handling, as the
    # csv documentation requires for files passed to a reader.
    with open(file_path, "r", encoding="utf-8", newline="") as f:
        return list(DictReader(f, delimiter="\t"))


def save_dict(data: list, file_path: str) -> None:
    """Write a list of row dicts to a tab-separated file with a header row.

    The column order is taken from the keys of the first row.  When *data*
    is empty nothing is written and an existing file is left untouched.
    """
    if not data:
        return
    with open(file_path, "w", encoding="utf-8", newline="") as f:
        writer = DictWriter(f, fieldnames=list(data[0].keys()), delimiter="\t")
        writer.writeheader()
        writer.writerows(data)


def _iter_json_files(json_path: str):
    """Yield the full path of every ``*.json`` file below *json_path*."""
    for root, _, files in os.walk(json_path):
        for file in files:
            if file.endswith(".json"):
                yield os.path.join(root, file)


def _count_file(full_path: str, count_ref: dict) -> None:
    """Add the occurrences found in one JSON file to *count_ref*.

    *count_ref* maps each dictionary word to a ``Counter`` keyed by file
    path.  A record counts once when its ``name`` equals the word and once
    when the word is a substring of its flattened ``message``.
    """
    with open(full_path, "r", encoding="utf-8") as f:
        records = load(f)
    if not isinstance(records, list):
        # Only a top-level JSON array of records is understood; skip the rest.
        return
    for item in records:
        if not isinstance(item, dict):
            continue
        name = item.get("name")
        if isinstance(name, str) and name in count_ref:
            count_ref[name][full_path] += 1
        message = item.get("message", "")
        if not isinstance(message, str):
            # e.g. JSON null: treat it like a missing message.
            message = ""
        text = process_message(message)
        # Iterate the unique words so a word listed twice in the dictionary
        # is not counted twice for the same message.
        for word, per_file in count_ref.items():
            if word in text:
                per_file[full_path] += 1


def process_dict(dict_path: str, json_path: str) -> None:
    """Prune dictionary entries that occur in at most one JSON file.

    The TSV dictionary at *dict_path* must have a ``word`` column.  Every
    ``*.json`` file below *json_path* is scanned and, for each word, the
    number of occurrences per file is counted.  Files are identified by
    their full path, so equally named files in different subdirectories
    count as different files.  Entries whose word shows up in more than one
    file are kept and written back to *dict_path*; a summary is printed.

    NOTE: when no entry survives, ``save_dict`` skips the write, so the
    dictionary file keeps its previous content.
    """
    d = read_dict(dict_path)
    count_ref = {entry["word"]: Counter() for entry in d}

    for full_path in _iter_json_files(json_path):
        _count_file(full_path, count_ref)

    result = []
    removed = 0
    for entry in d:
        word = entry["word"]
        per_file = count_ref[word]
        file_count = len(per_file)
        count = sum(per_file.values())
        print(f"Word: {word}, Files: {file_count}, Total Count: {count}")
        if file_count > 1:
            result.append(entry)
        else:
            removed += 1
    print(f"Removed {removed} entries with occurrences in 1 or fewer files.")
    save_dict(result, dict_path)


if __name__ == "__main__":
    from argparse import ArgumentParser

    parser = ArgumentParser(
        description="Process dictionary and JSON files to count word occurrences."
    )
    parser.add_argument("dict_path", help="Path to the TSV dictionary file.")
    parser.add_argument("json_path", help="Path to the directory containing JSON files.")
    args = parser.parse_args()
    process_dict(args.dict_path, args.json_path)