"""
Script 1: Extract random sentences from EN-HI and EN-PA parallel files

WITH PROGRESS BAR AND OPTIMIZATIONS
"""
|
| |
|
import random
import re
import sys
import time
from pathlib import Path

import ftfy
import numpy as np
import pandas as pd
from langdetect import detect, LangDetectException
from tqdm import tqdm
|
| |
|
def clean_text(text):
    """Normalize a raw cell value into clean, single-spaced text.

    Repairs mojibake with ftfy, collapses whitespace runs to single spaces,
    strips remaining ASCII control characters, and trims the result.

    Args:
        text: Value taken from a DataFrame cell. Non-strings, and the
            literal string 'nan' that ``astype(str)`` produces for missing
            values, are treated as empty.

    Returns:
        The cleaned string, or "" when the input is unusable.
    """
    if not isinstance(text, str):
        return ""

    # astype(str) turns NaN cells into the literal string 'nan'.
    # (The original also called pd.isna(text) here, but pd.isna is always
    # False for a str, so that check was dead code and has been removed.)
    if text == 'nan':
        return ""

    text = ftfy.fix_text(text)        # repair encoding damage / mojibake
    text = re.sub(r'\s+', ' ', text)  # collapse all whitespace runs
    # Strip remaining ASCII control chars; \t \n \r were already collapsed
    # into spaces by the \s+ pass above, so they are excluded here.
    text = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]', '', text)
    return text.strip()
|
| |
|
# Pre-compiled per-language script detectors, hoisted out of the hot loop
# (the filter runs over hundreds of thousands of sentences).
_SCRIPT_RE = {
    'en': re.compile(r'[a-zA-Z]'),
    'hi': re.compile(r'[\u0900-\u097F]'),  # Devanagari block
    'pa': re.compile(r'[\u0A00-\u0A7F]'),  # Gurmukhi block
}


def is_valid_sentence_fast(text, target_lang):
    """Cheap heuristic sentence filter (no langdetect call).

    Accepts *text* when it is at least 20 characters, 5-50 whitespace-split
    words, has at least 7 distinct characters (rejects degenerate repeats),
    and contains at least one character of the target language's script.

    Args:
        text: Candidate sentence (already cleaned).
        target_lang: 'en', 'hi', or 'pa'; any other code skips the script
            check and the sentence passes on structure alone.

    Returns:
        True when the sentence passes all heuristics, else False.
    """
    if not text or len(text) < 20:
        return False

    words = text.split()
    if len(words) < 5 or len(words) > 50:
        return False

    # Reject degenerate/repetitive strings (e.g. "aaaa aaaa aaaa ...").
    if len(set(text)) < 7:
        return False

    # Require at least one character from the target script, if known.
    pattern = _SCRIPT_RE.get(target_lang)
    if pattern is not None and not pattern.search(text):
        return False

    return True
|
| |
|
def is_valid_sentence_with_lang(text, target_lang, use_fast=True):
    """Validate *text* for *target_lang*, optionally confirming with langdetect.

    The cheap structural filter always runs first. When *use_fast* is False,
    langdetect is consulted as well: a sentence is rejected only when the
    detected language both differs from the target and falls into a set of
    known conflicting languages, so detector noise does not discard data.
    """
    if not is_valid_sentence_fast(text, target_lang):
        return False
    if use_fast:
        return True

    try:
        detected = detect(text)
    except LangDetectException:
        # Detector could not decide — keep the sentence.
        return True

    expected = {'hi': ['hi'], 'pa': ['pa'], 'en': ['en']}
    if target_lang not in expected or detected in expected[target_lang]:
        return True

    # Detection disagrees with the target: tolerate the mismatch unless the
    # detected language is one we specifically want to exclude.
    if target_lang == 'en':
        return detected not in ['hi', 'pa', 'mr', 'gu']
    return detected not in ['en']
|
| |
|
def extract_from_parallel_csv_optimized(input_csv, output_dir, en_samples, other_samples, other_lang_code):
    """
    Extract random sentences from a parallel CSV - OPTIMIZED.

    Reads *input_csv* (preferred columns: 'src' = English, 'tgt' = the other
    language), cleans and filters both sides independently, optionally
    confirms languages with langdetect when the fast filter alone yields too
    few sentences, then samples and writes the requested counts to disk.

    Args:
        input_csv: Path to the parallel CSV file.
        output_dir: Directory where the two output .txt files are written.
        en_samples: Desired number of English sentences.
        other_samples: Desired number of sentences in the other language.
        other_lang_code: Language code of the non-English side ('hi'/'pa').

    Returns:
        (sampled_en, sampled_other) lists of sentences actually written;
        ([], []) when the CSV cannot be read at all.
    """
    print(f"\n{'='*60}")
    print(f"Processing {input_csv}...")
    print(f"Target: {en_samples} EN, {other_samples} {other_lang_code}")
    print('='*60)

    start_time = time.time()

    print("Reading CSV file...")
    try:
        df = pd.read_csv(input_csv, on_bad_lines='skip')
    except Exception as e:
        print(f"Error reading {input_csv}: {e}")
        # Retry with a permissive single-byte encoding before giving up.
        # (Was a bare `except:` — narrowed so KeyboardInterrupt etc. escape.)
        try:
            df = pd.read_csv(input_csv, encoding='latin-1', on_bad_lines='skip')
        except Exception:
            print(f"Failed to read {input_csv}")
            return [], []

    print(f"Loaded {len(df):,} rows")
    print(f"Columns: {list(df.columns)}")

    # Prefer named columns; otherwise assume an [index, src, tgt] layout.
    src_col = 'src' if 'src' in df.columns else df.columns[1]
    tgt_col = 'tgt' if 'tgt' in df.columns else df.columns[2]
    print(f"Source: {src_col}, Target: {tgt_col}")

    print("\nCleaning data...")
    # NOTE: the original made an unused full copy (`df_clean = df.copy()`)
    # and collected row-index lists that were never read — both removed to
    # cut peak memory on multi-hundred-thousand-row inputs.

    valid_src = []
    print(f"Processing {src_col} column...")
    for text in tqdm(df[src_col].astype(str), total=len(df), desc="Cleaning English"):
        cleaned = clean_text(text)
        if len(cleaned) > 10:
            valid_src.append(cleaned)

    valid_tgt = []
    print(f"\nProcessing {tgt_col} column...")
    for text in tqdm(df[tgt_col].astype(str), total=len(df), desc=f"Cleaning {other_lang_code}"):
        cleaned = clean_text(text)
        if len(cleaned) > 10:
            valid_tgt.append(cleaned)

    print(f"\nAfter cleaning:")
    print(f" Valid English sentences: {len(valid_src):,}")
    print(f" Valid {other_lang_code} sentences: {len(valid_tgt):,}")

    print("\nFast filtering sentences...")
    fast_valid_en = []
    for text in tqdm(valid_src, desc="Filtering English"):
        if is_valid_sentence_fast(text, 'en'):
            fast_valid_en.append(text)

    fast_valid_other = []
    for text in tqdm(valid_tgt, desc=f"Filtering {other_lang_code}"):
        if is_valid_sentence_fast(text, other_lang_code):
            fast_valid_other.append(text)

    print(f"\nAfter fast filtering:")
    print(f" English: {len(fast_valid_en):,}")
    print(f" {other_lang_code}: {len(fast_valid_other):,}")

    if len(fast_valid_en) >= en_samples and len(fast_valid_other) >= other_samples:
        # Enough candidates from the cheap filter alone — skip langdetect.
        final_en = fast_valid_en
        final_other = fast_valid_other
        print("Using fast-filtered sentences (skipping langdetect)")
    else:
        print("\nApplying language detection on subset...")

        # Cap the expensive langdetect pass at 100k sentences per side.
        # (A plain slice is equivalent to the original conditional slice.)
        sample_en = fast_valid_en[:100000]
        sample_other = fast_valid_other[:100000]

        final_en = []
        print("Validating English with langdetect...")
        for text in tqdm(sample_en, desc="English langdetect"):
            if is_valid_sentence_with_lang(text, 'en', use_fast=False):
                final_en.append(text)

        final_other = []
        print(f"Validating {other_lang_code} with langdetect...")
        for text in tqdm(sample_other, desc=f"{other_lang_code} langdetect"):
            if is_valid_sentence_with_lang(text, other_lang_code, use_fast=False):
                final_other.append(text)

        print(f"\nAfter langdetect:")
        print(f" English: {len(final_en):,}")
        print(f" {other_lang_code}: {len(final_other):,}")

    # Never request more samples than survived filtering.
    en_samples = min(en_samples, len(final_en))
    other_samples = min(other_samples, len(final_other))

    print(f"\nSampling {en_samples:,} English and {other_samples:,} {other_lang_code} sentences...")

    sampled_en = random.sample(final_en, en_samples)
    sampled_other = random.sample(final_other, other_samples)

    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    en_filename = output_dir / f'en_{other_lang_code}_english.txt'
    with open(en_filename, 'w', encoding='utf-8') as f:
        for sentence in sampled_en:
            f.write(f"{sentence}\n")

    other_filename = output_dir / f'en_{other_lang_code}_{other_lang_code}.txt'
    with open(other_filename, 'w', encoding='utf-8') as f:
        for sentence in sampled_other:
            f.write(f"{sentence}\n")

    elapsed = time.time() - start_time
    print(f"\n✓ Saved {en_samples:,} English sentences to: {en_filename}")
    print(f"✓ Saved {other_samples:,} {other_lang_code} sentences to: {other_filename}")
    print(f"⏱️ Processing time: {elapsed:.2f} seconds ({elapsed/60:.2f} minutes)")

    return sampled_en, sampled_other
|
| |
|
def main():
    """Drive the full extraction pipeline for both language pairs."""
    # Input files and destination directory.
    EN_HI_CSV = "en-hi.csv"
    EN_PA_CSV = "en-pa.csv"
    OUTPUT_DIR = "./extracted_sentences"

    # Per-dataset sampling targets.
    EN_HI_EN_SAMPLES = 150000
    EN_HI_HI_SAMPLES = 300000
    EN_PA_EN_SAMPLES = 150000
    EN_PA_PA_SAMPLES = 300000

    rule = "=" * 70

    def banner(title, lead=""):
        # Framed section header; `lead` lets callers prepend a blank line.
        print(lead + rule)
        print(title)
        print(rule)

    banner("MULTILINGUAL DATA EXTRACTION TOOL")

    # Fixed seeds so repeated runs sample identical sentences.
    random.seed(42)
    np.random.seed(42)

    banner("EXTRACTING FROM ENGLISH-HINDI DATASET", lead="\n")
    en_hi_en, en_hi_hi = extract_from_parallel_csv_optimized(
        EN_HI_CSV, OUTPUT_DIR,
        EN_HI_EN_SAMPLES, EN_HI_HI_SAMPLES, 'hi'
    )

    banner("EXTRACTING FROM ENGLISH-PUNJABI DATASET", lead="\n")
    en_pa_en, en_pa_pa = extract_from_parallel_csv_optimized(
        EN_PA_CSV, OUTPUT_DIR,
        EN_PA_EN_SAMPLES, EN_PA_PA_SAMPLES, 'pa'
    )

    banner("CREATING COMBINED ENGLISH FILE", lead="\n")
    all_english = en_hi_en + en_pa_en
    random.shuffle(all_english)

    # Cap the combined English file at 100k sentences.
    combined_filename = Path(OUTPUT_DIR) / "combined_english.txt"
    with open(combined_filename, 'w', encoding='utf-8') as f:
        f.writelines(f"{sentence}\n" for sentence in all_english[:100000])

    print(f"\n✓ Saved {min(100000, len(all_english)):,} combined English sentences")

    banner("EXTRACTION COMPLETE - FINAL STATISTICS", lead="\n")
    print(f"Total English sentences: {len(all_english):,}")
    print(f"Total Hindi sentences: {len(en_hi_hi):,}")
    print(f"Total Punjabi sentences: {len(en_pa_pa):,}")

    # Persist a human-readable run summary next to the extracted files.
    summary_file = Path(OUTPUT_DIR) / "extraction_summary.txt"
    summary_lines = [
        "DATA EXTRACTION SUMMARY\n",
        "=" * 50 + "\n\n",
        f"English-Hindi Dataset:\n",
        f" English sentences: {len(en_hi_en):,}\n",
        f" Hindi sentences: {len(en_hi_hi):,}\n\n",
        f"English-Punjabi Dataset:\n",
        f" English sentences: {len(en_pa_en):,}\n",
        f" Punjabi sentences: {len(en_pa_pa):,}\n\n",
        f"Combined English: {min(100000, len(all_english)):,}\n",
        f"Total corpus size: {len(all_english) + len(en_hi_hi) + len(en_pa_pa):,} sentences\n",
    ]
    with open(summary_file, 'w', encoding='utf-8') as f:
        f.writelines(summary_lines)

    print(f"\n📊 Summary saved to: {summary_file}")
    print("\n✅ All done! Ready for corpus creation.")
|
| |
|
if __name__ == "__main__":
    # Self-install fallback for the progress-bar dependency.
    # NOTE(review): the top-of-file `from tqdm import tqdm` already raises at
    # module import time when tqdm is missing, so this fallback only takes
    # effect if that import is also guarded — confirm intent.
    try:
        from tqdm import tqdm
    except ImportError:
        print("Installing tqdm for progress bars...")
        import subprocess
        # Invoke pip through the interpreter running this script, not
        # whatever bare `pip` happens to be first on PATH.
        subprocess.check_call([sys.executable, "-m", "pip", "install", "tqdm"])
        from tqdm import tqdm

    main()