File size: 1,793 Bytes
d2a63cc
acd7cf4
 
d2a63cc
acd7cf4
 
 
 
 
 
 
d2a63cc
acd7cf4
 
 
 
 
d2a63cc
acd7cf4
 
d2a63cc
acd7cf4
 
 
d2a63cc
acd7cf4
d2a63cc
acd7cf4
d2a63cc
 
 
 
 
 
acd7cf4
 
 
 
 
 
 
 
 
 
 
 
 
3a3b216
acd7cf4
d2a63cc
acd7cf4
 
d2a63cc
acd7cf4
 
d2a63cc
acd7cf4
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
import json
import os
import sys

import pandas as pd

# Make the project-local `graphgen` package importable when this file is run
# directly: the directory two levels above this file is appended to sys.path
# (presumably the repository root -- confirm against the project layout).
# The import below must stay after the sys.path tweak, hence the pylint waiver.
# pylint: disable=wrong-import-position
root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(root_dir)
from graphgen.models import Tokenizer


# Plain-text files carry no record structure, so they are cut into fixed-size
# character windows before being tokenized.
_TXT_CHUNK_SIZE = 512


def _load_items(file):
    """Read *file* and return its records as a flat list.

    Supported extensions (matched case-insensitively):

    * ``.jsonl`` -- one JSON value per non-blank line.
    * ``.json``  -- a list whose entries are either records or lists of
      records (nested lists are flattened one level); any other top-level
      value is treated as a single record.
    * ``.txt``   -- the whole text split into ``_TXT_CHUNK_SIZE``-character
      chunks, each wrapped as ``{"content": chunk}``.
    * ``.csv``   -- the ``content`` column if present, otherwise the first
      column; missing cells are dropped and values are converted to ``str``.

    Raises:
        ValueError: If the file extension is not one of the above.
    """
    name = file.lower()

    if name.endswith(".jsonl"):
        with open(file, "r", encoding="utf-8") as f:
            # Blank lines (e.g. a trailing newline) are not valid JSON; skip
            # them instead of failing the whole count.
            return [json.loads(line) for line in f if line.strip()]

    if name.endswith(".json"):
        with open(file, "r", encoding="utf-8") as f:
            loaded = json.load(f)
        if not isinstance(loaded, list):
            return [loaded]
        items = []
        for entry in loaded:
            # Only flatten real sub-lists. Iterating a dict or a string here
            # would yield keys / single characters and corrupt the count.
            if isinstance(entry, list):
                items.extend(entry)
            else:
                items.append(entry)
        return items

    if name.endswith(".txt"):
        with open(file, "r", encoding="utf-8") as f:
            text = f.read()
        return [
            {"content": text[start : start + _TXT_CHUNK_SIZE]}
            for start in range(0, len(text), _TXT_CHUNK_SIZE)
        ]

    if name.endswith(".csv"):
        df = pd.read_csv(file)
        column = df["content"] if "content" in df.columns else df.iloc[:, 0]
        # Empty cells are read as NaN floats and numeric cells as numbers;
        # neither is text a tokenizer can encode.
        return column.dropna().astype(str).tolist()

    raise ValueError(f"Unsupported file type: {file}")


def count_tokens(file, tokenizer_name, data_frame):
    """Count the tokens in *file* and return a one-row summary DataFrame.

    Args:
        file: Path to a ``.jsonl``, ``.json``, ``.txt`` or ``.csv`` file.
            If it is falsy or does not exist, ``data_frame`` is returned
            unchanged.
        tokenizer_name: Name passed to ``Tokenizer`` to build the tokenizer.
        data_frame: Existing DataFrame; only its ``columns`` are reused as
            the header of the returned frame.

    Returns:
        A new single-row DataFrame holding
        ``[token_count, token_count * 50, "N/A"]`` (the first two as
        strings) under ``data_frame.columns``. If that frame cannot be
        built (e.g. the column count does not match), the error is printed
        and the original ``data_frame`` is returned.

    Raises:
        ValueError: If the file extension is not supported.
    """
    if not file or not os.path.exists(file):
        return data_frame

    items = _load_items(file)

    tokenizer = Tokenizer(tokenizer_name)

    token_count = 0
    for item in items:
        content = item.get("content", "") if isinstance(item, dict) else item
        if content is None:
            # A record with an explicit null content contributes no tokens.
            continue
        if not isinstance(content, str):
            content = str(content)
        token_count += len(tokenizer.encode(content))

    # NOTE(review): the factor 50 looks like an estimated usage multiplier
    # for the second column -- confirm against the UI that displays it.
    summary_row = [[str(token_count), str(token_count * 50), "N/A"]]

    try:
        data_frame = pd.DataFrame(summary_row, columns=data_frame.columns)
    except Exception as e:  # pylint: disable=broad-except
        # Best effort: keep the caller's frame if the summary cannot be built.
        print("[ERROR] DataFrame操作异常:", str(e))

    return data_frame