From a6cc95b2ed69924fe677a447f93e955489a9a17e Mon Sep 17 00:00:00 2001
From: lifegpc <root@lifegpc.com>
Date: Mon, 30 Dec 2024 08:27:05 +0800
Subject: [PATCH] Add script to count tokens in JSONL files with customizable
 encoding and overhead

---
 count_tokens.py | 81 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 81 insertions(+)
 create mode 100644 count_tokens.py

diff --git a/count_tokens.py b/count_tokens.py
new file mode 100644
index 0000000..92c76fe
--- /dev/null
+++ b/count_tokens.py
@@ -0,0 +1,81 @@
+import json
+import tiktoken
+import argparse
+
+
+def count_tokens(text, encoding):
+    """Return the number of tokens `encoding` produces for `text`.
+
+    Special-token markers inside the text (such as '<|endoftext|>') are
+    counted as ordinary text; with tiktoken's default settings they would
+    make `encode` raise ValueError.
+    """
+    return len(encoding.encode(text, disallowed_special=()))
+
+
+def _content_texts(content):
+    """Return the strings to tokenize for a message's `content` value.
+
+    A string is returned as a one-item list. A list is treated as content
+    parts: the string 'text' field of every dict item is kept. Any other
+    value (for example null) yields no text.
+    """
+    if isinstance(content, str):
+        return [content]
+    if isinstance(content, list):
+        return [part['text'] for part in content
+                if isinstance(part, dict)
+                and isinstance(part.get('text'), str)]
+    return []
+
+
+def calculate_tokens_in_file(file_path, encoding_name, model_name, overhead):
+    """Print and return the total token count of a JSONL file.
+
+    Every non-blank line is parsed as a JSON object. For each entry of its
+    'messages' list the 'content' is tokenized and `overhead` extra tokens
+    are added.
+
+    The tokenizer comes from `encoding_name` when given, else from
+    `model_name`, else it is 'cl100k_base'.
+    """
+    if encoding_name:
+        encoding = tiktoken.get_encoding(encoding_name)
+    elif model_name:
+        encoding = tiktoken.encoding_for_model(model_name)
+    else:
+        encoding = tiktoken.get_encoding('cl100k_base')
+    print('Encoding name:', encoding.name)
+    total_tokens = 0
+
+    with open(file_path, 'r', encoding='utf-8') as file:
+        for line in file:
+            # Blank lines (e.g. a trailing one) are not JSON documents.
+            if not line.strip():
+                continue
+            data = json.loads(line)
+            # `or []` also covers an explicit "messages": null.
+            for message in data.get('messages') or []:
+                for text in _content_texts(message.get('content')):
+                    total_tokens += count_tokens(text, encoding)
+                total_tokens += overhead
+    print(f"Total tokens in file: {total_tokens}")
+    return total_tokens
+
+
+def main():
+    """Parse the command line and report the token total for one file."""
+    parser = argparse.ArgumentParser(description="Calculate the number of tokens in a JSONL file.")  # noqa: E501
+    parser.add_argument('file_path', type=str, help='Path to the JSONL file')
+    parser.add_argument('-e', '--encoding', type=str, help='Encoding to use for tokenization (default: cl100k_base)')  # noqa: E501
+    parser.add_argument('-m', '--model', type=str, help='Encoding model')
+    parser.add_argument('-o', '--overhead', type=int, default=3, help='Overhead token count for each message. Default: 3')  # noqa: E501
+
+    args = parser.parse_args()
+
+    calculate_tokens_in_file(args.file_path, args.encoding, args.model,
+                             args.overhead)
+
+
+if __name__ == "__main__":
+    main()