Spaces:
Sleeping
Sleeping
| import pandas as pd | |
| import re | |
# =============================================================================
# CONFIGURATION - Update these paths as needed
# =============================================================================
# Both inputs are CSV files.  SUBMISSION_PATH is also the file the script
# writes its updated results back to at the end of the run.
ASR_RESULTS_PATH = "asr_results (2).csv"
SUBMISSION_PATH = "75.csv"

# =============================================================================
# LOAD DATA
# =============================================================================
print("Loading data files...")
# Read the ASR results first, then the submission table, each into its own
# DataFrame.
df_asr, df_submission = (
    pd.read_csv(csv_path) for csv_path in (ASR_RESULTS_PATH, SUBMISSION_PATH)
)
| # ============================================================================= | |
| # FUNCTION DEFINITIONS | |
| # ============================================================================= | |
def has_request_time(text):
    """Report whether *text* contains a numeric time request.

    The phrase searched for is ``ขอเวลา`` followed by one or more digits and
    then one of the units ``นาที`` or ``ชั่วโมง``; whitespace between the
    three parts is optional.

    Args:
        text: Value to search. It is converted with ``str()`` first, so
            non-string values (for example missing cells) are accepted.

    Returns:
        bool: True if the phrase occurs anywhere in the text, False otherwise.
    """
    request_pattern = re.compile(r"ขอเวลา\s*\d+\s*(นาที|ชั่วโมง)")
    found = request_pattern.search(str(text))
    return found is not None
def name_match(row, transcription):
    """Check whether both the first and the last name occur in a transcription.

    Args:
        row: Record supporting ``row["first_name"]`` and ``row["last_name"]``
            (for example a DataFrame row).
        transcription: Text to search in.

    Returns:
        bool: True only when both names are present and non-blank and each
        occurs as a substring of the transcription; False otherwise,
        including when any of the three values is missing.
    """
    first = row["first_name"]
    last = row["last_name"]

    # Missing values must never match.  Converting them with str() yields
    # "nan"/"None", which would match a missing transcription (also "nan")
    # or any text that happens to contain those letters.
    if pd.isna(first) or pd.isna(last) or pd.isna(transcription):
        return False

    # Surrounding whitespace is not part of a name.
    first = str(first).strip()
    last = str(last).strip()

    # A blank string is a substring of every string, so it cannot count as
    # a match.
    if not first or not last:
        return False

    text = str(transcription)
    return first in text and last in text
# =============================================================================
# TASK 1: Find rows containing "สวัสดี" (greeting)
# =============================================================================
print("\n=== TASK 1: Finding greeting patterns ===")

# Flag every ASR row in which any column, rendered as text, contains the
# greeting.  The mask is accumulated column by column with a literal
# (non-regex) substring search, so no Python callback runs per row and the
# mask is always a boolean Series aligned with df_asr, even when the frame
# has no rows.
greeting_mask = pd.Series(False, index=df_asr.index, dtype=bool)
for column_name in df_asr.columns:
    column_text = df_asr[column_name].astype(str)
    greeting_mask |= column_text.str.contains("สวัสดี", regex=False)
rows_with_sawasdee = df_asr[greeting_mask]

print(f"Found {len(rows_with_sawasdee)} rows with greeting patterns")
print(rows_with_sawasdee)

# Update submission file for greeting column.
# Only rows whose id was flagged above AND whose value is currently False are
# selected, so the count below reports values that actually change.
matching_ids_greeting = set(rows_with_sawasdee["id"])
mask_greeting = df_submission["id"].isin(matching_ids_greeting) & df_submission[
    "กล่าวสวัสดี"
].eq(False)
num_changed_greeting = mask_greeting.sum()

# Apply changes to submission file
df_submission.loc[mask_greeting, "กล่าวสวัสดี"] = True
print(f"Number of rows updated for greeting: {num_changed_greeting}")
# =============================================================================
# TASK 2: Find rows with time request patterns
# =============================================================================
print("\n=== TASK 2: Finding time request patterns ===")

# Keep only the ASR rows whose transcription passes has_request_time().
time_request_flags = df_asr["transcription"].apply(has_request_time)
rows_with_request_time = df_asr[time_request_flags]
print(f"Found {len(rows_with_request_time)} rows with time request patterns")
print(rows_with_request_time[["id", "transcription"]])

# Update submission file for time request column.
# A submission row is selected when its id was flagged above and its current
# value is False, so the count reflects values that actually flip.
request_time_ids = set(rows_with_request_time["id"])
id_is_flagged = df_submission["id"].isin(request_time_ids)
value_is_false = df_submission["บอกระยะเวลาที่ใช้ในการเข้าพบ"].eq(False)
mask_time = id_is_flagged & value_is_false
num_changed_time = mask_time.sum()

# Apply changes to submission file
df_submission.loc[mask_time, "บอกระยะเวลาที่ใช้ในการเข้าพบ"] = True
print(f"Number of rows updated for time request: {num_changed_time}")
# =============================================================================
# TASK 3: Name matching analysis
# =============================================================================
print("\n=== TASK 3: Name matching analysis ===")

# Column order of the result table; passing it explicitly keeps the schema
# stable even when no match is found.
matched_columns = ["id", "first_name", "last_name", "transcription"]

# Pair each ASR id with its transcription once, instead of rebuilding a
# filtered DataFrame for every submission row.
asr_pairs = list(zip(df_asr["id"], df_asr["transcription"]))

# Find name matches using the updated submission file: one output record per
# (submission row, ASR row) pair accepted by name_match(), in submission
# order and then ASR order.
matched_rows = []
for _, sub_row in df_submission.iterrows():
    for asr_id, transcription in asr_pairs:
        if name_match(sub_row, transcription):
            matched_rows.append(
                {
                    "id": asr_id,
                    "first_name": sub_row["first_name"],
                    "last_name": sub_row["last_name"],
                    "transcription": transcription,
                }
            )

# Convert to DataFrame and display results
df_matched = pd.DataFrame(matched_rows, columns=matched_columns)
print(f"Found {len(df_matched)} name matches")
print(df_matched)
# =============================================================================
# SAVE UPDATED SUBMISSION FILE
# =============================================================================
print("\n=== SAVING UPDATED SUBMISSION FILE ===")
# The updated table is written to the same path it was loaded from, without
# the DataFrame index.
df_submission.to_csv(SUBMISSION_PATH, index=False)
print(f"Updated submission file saved as: {SUBMISSION_PATH}")

# =============================================================================
# SUMMARY
# =============================================================================
print("\n=== PROCESSING SUMMARY ===")
# One line per figure collected during the run, printed in a fixed order.
summary_lines = (
    f"✓ Greeting patterns found: {len(rows_with_sawasdee)}",
    f"✓ Greeting updates applied: {num_changed_greeting}",
    f"✓ Time request patterns found: {len(rows_with_request_time)}",
    f"✓ Time request updates applied: {num_changed_time}",
    f"✓ Name matches found: {len(df_matched)}",
    f"✓ All changes applied to: {SUBMISSION_PATH}",
    f"✓ Source data from: {ASR_RESULTS_PATH}",
)
for summary_line in summary_lines:
    print(summary_line)