# encoder_parallel_telugu.py — BPE-Tokenizer: parallel UTF-8 encoder for a Telugu corpus.
import time
import concurrent.futures
from tqdm import tqdm
import pandas as pd
import re
# Worker function: runs in a separate process via ProcessPoolExecutor.
def encode_chunk(chunk):
    """Return a list holding the UTF-8 bytes of every token in *chunk*."""
    return [piece.encode('utf-8') for piece in chunk]
# Orchestrator: fan token slices out to worker processes and gather the results.
def encode_tokens_parallel(tokens, chunk_size=1_000_000, max_workers=10):
    """UTF-8 encode *tokens* in parallel and return one flat list of bytes.

    The sequence is sliced into pieces of at most *chunk_size* items, each
    piece is encoded by encode_chunk in a worker process, and the per-piece
    results are flattened back into a single list. Order is preserved because
    executor.map yields results in submission order.
    """
    # Slice the input into fixed-size pieces for the pool.
    pieces = [tokens[start:start + chunk_size]
              for start in range(0, len(tokens), chunk_size)]
    with concurrent.futures.ProcessPoolExecutor(max_workers=max_workers) as pool:
        # tqdm wraps the lazy map so the bar advances as each piece completes.
        encoded_pieces = list(tqdm(pool.map(encode_chunk, pieces),
                                   total=len(pieces), desc="Processing Chunks"))
    # Flatten the per-piece lists into one list of encoded tokens.
    flattened = []
    for piece in encoded_pieces:
        flattened.extend(piece)
    return flattened
def load_telugu_texts(file_paths=None):
    """Load Telugu corpus CSVs and return one cleaned string of Telugu text.

    Each CSV is expected to carry its content in a 'text' or 'body' column;
    files with neither column are silently skipped. All rows are joined and
    every character outside the Telugu Unicode block (U+0C00-U+0C7F) —
    English letters, digits, whitespace, and special characters such as
    @, #, $, % — is stripped, leaving a pure Telugu character stream.

    Args:
        file_paths: Optional list of CSV paths to read. Defaults to the
            original hard-coded corpus locations when omitted (kept as the
            default for backward compatibility).

    Returns:
        A single str containing only characters in U+0C00-U+0C7F.
    """
    if file_paths is None:
        file_paths = [
            '/Users/anvesh/codebase/llm/data/telugu_books/telugu_books.csv',
            '/Users/anvesh/codebase/llm/data/telugu_news/1_telugu_news.csv',
            '/Users/anvesh/codebase/llm/data/telugu_news/2_telugu_news.csv'
        ]
    # Combine data from all files.
    telugu_texts = []
    for file_path in file_paths:
        df = pd.read_csv(file_path)
        if 'text' in df.columns:
            telugu_texts.append(' '.join(df['text'].astype(str).tolist()))
        elif 'body' in df.columns:
            telugu_texts.append(' '.join(df['body'].astype(str).tolist()))
    telugu_text = ' '.join(telugu_texts)
    # BUG FIX: the original pattern [^\u0C00-\u0C7F@#$%] *kept* @, #, $ and %
    # even though the stated intent was to remove them. Keeping only the
    # Telugu block also strips \r, \n and \xa0, so the original's second
    # substitution pass is no longer needed.
    telugu_text = re.sub(r'[^\u0C00-\u0C7F]', '', telugu_text)
    return telugu_text
# Script entry point: load the corpus, encode it in parallel, report timing.
if __name__ == '__main__':
    # Build the cleaned Telugu character stream to encode.
    tokens = load_telugu_texts()

    # Time only the parallel encoding pass (corpus loading excluded).
    started = time.time()
    encoded_tokens = encode_tokens_parallel(tokens, chunk_size=1_000_000, max_workers=10)

    # Show a sample of the output and its total size.
    print(encoded_tokens[:100])
    print(len(encoded_tokens))

    finished = time.time()
    time_taken = finished - started
    print(f"Time taken to encode and process tokens in parallel: {time_taken:.4f} seconds")
    print("Encoding and processing completed!")