Files
pythonscript/count_tokens.py

46 lines
1.6 KiB
Python

import json
import tiktoken
import argparse
def count_tokens(text, encoding):
    """Return the number of tokens ``encoding`` produces for ``text``.

    Args:
        text: The text to tokenize.
        encoding: Any object exposing ``encode(text)`` that returns a sized
            collection of tokens.

    Returns:
        The length of the encoded token sequence.
    """
    token_ids = encoding.encode(text)
    return len(token_ids)
def calculate_tokens_in_file(file_path, encoding_name, model_name, overhead):
    """Count the tokens of every message in a JSONL file and print the total.

    Each non-blank line of the file is parsed as one JSON object. For every
    entry of that object's ``messages`` list, the tokens of its ``content``
    field are counted and ``overhead`` is added on top.

    Args:
        file_path: Path to the UTF-8 encoded JSONL file.
        encoding_name: Name of a tiktoken encoding. Takes precedence over
            ``model_name`` when truthy.
        model_name: Model name used to look up the encoding when
            ``encoding_name`` is falsy. When both are falsy, the
            ``cl100k_base`` encoding is used.
        overhead: Number of tokens added for every message.

    Returns:
        The total token count (the same value that is printed).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    if encoding_name:
        encoding = tiktoken.get_encoding(encoding_name)
    elif model_name:
        encoding = tiktoken.encoding_for_model(model_name)
    else:
        encoding = tiktoken.get_encoding('cl100k_base')
    print('Encoding name:', encoding.name)

    total_tokens = 0
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # Blank lines (typically a trailing newline at the end of the
            # file) are not records; json.loads would reject them.
            if not line.strip():
                continue
            data = json.loads(line)
            # Treat an explicit null "messages" the same as a missing key.
            messages = data.get('messages') or []
            for message in messages:
                content = message.get('content')
                # "content" may be present but null; such a message
                # contributes only the per-message overhead.
                if content is None:
                    content = ''
                total_tokens += count_tokens(content, encoding) + overhead
    print(f"Total tokens in file: {total_tokens}")
    return total_tokens
def main():
    """Parse the command-line options and count the tokens of the given file.

    Options: a positional ``file_path``, ``-e/--encoding``, ``-m/--model``
    and ``-o/--overhead`` (integer, default 3). The parsed values are handed
    to ``calculate_tokens_in_file`` in that order.
    """
    cli = argparse.ArgumentParser(
        description="Calculate the number of tokens in a JSONL file.",
    )
    cli.add_argument('file_path', type=str, help='Path to the JSONL file')
    cli.add_argument(
        '-e', '--encoding',
        type=str,
        help='Encoding to use for tokenization (default: cl100k_base)',
    )
    cli.add_argument('-m', '--model', type=str, help='Encoding model')
    cli.add_argument(
        '-o', '--overhead',
        type=int,
        default=3,
        help='Overhead token count for each message. Default: 3',
    )
    options = cli.parse_args()

    calculate_tokens_in_file(
        options.file_path,
        options.encoding,
        options.model,
        options.overhead,
    )
# Run the command-line interface only when this file is executed as a script,
# so importing the module has no side effects.
if __name__ == "__main__":
    main()