Spaces:
Sleeping
Sleeping
| import json | |
| import os | |
| import sys | |
| import pandas as pd | |
| # pylint: disable=wrong-import-position | |
| root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) | |
| sys.path.append(root_dir) | |
| from graphgen.models import Tokenizer | |
def count_tokens(file, tokenizer_name, data_frame):
    """Count tokens in *file* and return a one-row summary DataFrame.

    Loads the file according to its extension:
      - ``.jsonl``: one JSON object per line;
      - ``.json``: a JSON array of arrays, flattened one level;
      - ``.txt``: raw text split into fixed 512-character chunks;
      - ``.csv``: the ``content`` column, or the first column as fallback.

    Each loaded item is either a dict with a ``content`` key or a plain
    value; its text is tokenized with ``Tokenizer(tokenizer_name)`` and the
    lengths are summed.

    :param file: path to the input file; if falsy or missing, *data_frame*
        is returned unchanged.
    :param tokenizer_name: name passed to the project ``Tokenizer``.
    :param data_frame: template DataFrame whose columns shape the result.
    :return: a new one-row DataFrame ``[token_count, token_count * 50,
        "N/A"]`` with *data_frame*'s columns, or the original *data_frame*
        if the file is absent or the DataFrame construction fails.
    :raises ValueError: for an unsupported file extension.
    """
    if not file or not os.path.exists(file):
        return data_frame

    if file.endswith(".jsonl"):
        with open(file, "r", encoding="utf-8") as f:
            data = [json.loads(line) for line in f]
    elif file.endswith(".json"):
        with open(file, "r", encoding="utf-8") as f:
            data = json.load(f)
        # Expected layout is a list of lists; flatten one level.
        data = [item for sublist in data for item in sublist]
    elif file.endswith(".txt"):
        with open(file, "r", encoding="utf-8") as f:
            text = f.read()
        # Split raw text into fixed 512-character chunks so very long
        # documents are tokenized piecewise.
        data = [{"content": text[i : i + 512]} for i in range(0, len(text), 512)]
    elif file.endswith(".csv"):
        df = pd.read_csv(file)
        if "content" in df.columns:
            data = df["content"].tolist()
        else:
            # No "content" column: fall back to the first column.
            data = df.iloc[:, 0].tolist()
    else:
        raise ValueError(f"Unsupported file type: {file}")

    tokenizer = Tokenizer(tokenizer_name)

    # Count tokens
    token_count = 0
    for item in data:
        if isinstance(item, dict):
            content = item.get("content", "")
        else:
            content = item
        # Robustness fix: CSV cells and JSON values may be None, NaN
        # (pandas emits float('nan') for empty cells), or numeric —
        # tokenizer.encode would raise on non-strings. Skip missing
        # values and stringify other scalars.
        if content is None:
            continue
        if isinstance(content, float) and content != content:  # NaN check
            continue
        if not isinstance(content, str):
            content = str(content)
        token_count += len(tokenizer.encode(content))

    _update_data = [[str(token_count), str(token_count * 50), "N/A"]]
    try:
        new_df = pd.DataFrame(_update_data, columns=data_frame.columns)
        data_frame = new_df
    except Exception as e:  # pylint: disable=broad-except
        print("[ERROR] DataFrame操作异常:", str(e))
    return data_frame