# encoder_parallel_telugu.py — BPE-Tokenizer: parallel UTF-8 encoder for a Telugu corpus.
import time
import concurrent.futures
from tqdm import tqdm
import pandas as pd
import re
# Worker function: runs in a separate process via ProcessPoolExecutor.
def encode_chunk(chunk):
    """Return a list holding the UTF-8 bytes of every token in *chunk*."""
    return [piece.encode('utf-8') for piece in chunk]
# Orchestrator: fan token slices out to worker processes and gather the results.
def encode_tokens_parallel(tokens, chunk_size=1_000_000, max_workers=10):
    """UTF-8 encode *tokens* in parallel and return one flat list of bytes.

    The sequence is sliced into pieces of at most *chunk_size* items, each
    piece is encoded by encode_chunk in a worker process, and the per-piece
    results are flattened back into a single list. Order is preserved because
    executor.map yields results in submission order.
    """
    # Slice the input into fixed-size pieces for the pool.
    pieces = [tokens[start:start + chunk_size]
              for start in range(0, len(tokens), chunk_size)]
    with concurrent.futures.ProcessPoolExecutor(max_workers=max_workers) as pool:
        # tqdm wraps the lazy map so the bar advances as each piece completes.
        encoded_pieces = list(tqdm(pool.map(encode_chunk, pieces),
                                   total=len(pieces), desc="Processing Chunks"))
    # Flatten the per-piece lists into one list of encoded tokens.
    flattened = []
    for piece in encoded_pieces:
        flattened.extend(piece)
    return flattened
def load_telugu_texts(file_paths=None):
    """Load Telugu corpus CSVs and return one cleaned string of Telugu text.

    Each CSV is expected to carry its content in a 'text' or 'body' column;
    files with neither column are silently skipped. All rows are joined and
    every character outside the Telugu Unicode block (U+0C00-U+0C7F) —
    English letters, digits, whitespace, and special characters such as
    @, #, $, % — is stripped, leaving a pure Telugu character stream.

    Args:
        file_paths: Optional list of CSV paths to read. Defaults to the
            original hard-coded corpus locations when omitted (kept as the
            default for backward compatibility).

    Returns:
        A single str containing only characters in U+0C00-U+0C7F.
    """
    if file_paths is None:
        file_paths = [
            '/Users/anvesh/codebase/llm/data/telugu_books/telugu_books.csv',
            '/Users/anvesh/codebase/llm/data/telugu_news/1_telugu_news.csv',
            '/Users/anvesh/codebase/llm/data/telugu_news/2_telugu_news.csv'
        ]
    # Combine data from all files.
    telugu_texts = []
    for file_path in file_paths:
        df = pd.read_csv(file_path)
        if 'text' in df.columns:
            telugu_texts.append(' '.join(df['text'].astype(str).tolist()))
        elif 'body' in df.columns:
            telugu_texts.append(' '.join(df['body'].astype(str).tolist()))
    telugu_text = ' '.join(telugu_texts)
    # BUG FIX: the original pattern [^\u0C00-\u0C7F@#$%] *kept* @, #, $ and %
    # even though the stated intent was to remove them. Keeping only the
    # Telugu block also strips \r, \n and \xa0, so the original's second
    # substitution pass is no longer needed.
    telugu_text = re.sub(r'[^\u0C00-\u0C7F]', '', telugu_text)
    return telugu_text
# Script entry point: load the corpus, encode it in parallel, report timing.
if __name__ == '__main__':
    # Build the cleaned Telugu character stream to encode.
    tokens = load_telugu_texts()

    # Time only the parallel encoding pass (corpus loading excluded).
    started = time.time()
    encoded_tokens = encode_tokens_parallel(tokens, chunk_size=1_000_000, max_workers=10)

    # Show a sample of the output and its total size.
    print(encoded_tokens[:100])
    print(len(encoded_tokens))

    finished = time.time()
    time_taken = finished - started
    print(f"Time taken to encode and process tokens in parallel: {time_taken:.4f} seconds")
    print("Encoding and processing completed!")