|
|
|
|
|
""" |
|
|
Let's understand the relationship between the datasets by comparing a few records. |
|
|
""" |
|
|
import csv |
|
|
import json |
|
|
|
|
|
|
|
|
# --- Section 1: show the CSV header and one row whose metadata is fully populated ---
print("=== function_dataset_v2.csv structure ===")
# newline='' is what the csv module asks for: without it, line breaks embedded
# inside quoted fields are not interpreted correctly by the parser.
with open('function_dataset_v2.csv', 'r', encoding='utf-8', newline='') as f:
    reader = csv.DictReader(f)
    headers = reader.fieldnames
    print(f"Headers: {headers}")

    print("\nFinding a row with complete metadata...")
    for row in reader:
        # "Complete" means all three metadata columns are non-empty.
        if row['repo_name'] and row['path'] and row['language']:
            print("\nSample row WITH metadata:")
            print(f" original_index: {row['original_index']}")
            print(f" function_index: {row['function_index']}")
            print(f" repo_name: {row['repo_name']}")
            print(f" path: {row['path']}")
            print(f" language: {row['language']}")
            print(f" function_name: {row['function_name']}")
            # Only one sample is needed; stop at the first hit.
            break
    else:
        # Reached only when the loop ran to the end without a break, i.e. no
        # row qualified. Say so instead of ending the section silently.
        print("\nNo row with complete metadata found.")
|
|
|
|
|
|
|
|
# --- Section 2: show the fields of the first entry in the JSONL file ---
print("\n\n=== programming_problems.jsonl structure ===")
with open('programming_problems.jsonl', 'r', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if not line:
            # A blank line is not a JSON document; json.loads('') would raise.
            continue

        data = json.loads(line)
        # Read metadata defensively, the same way row_number is read: this
        # section exists to inspect structure, so a missing key should show up
        # as None in the output rather than abort the script.
        metadata = data.get('metadata') or {}

        print("First entry:")
        print(f" row_number: {data.get('row_number')}")
        print(f" metadata.original_index: {metadata.get('original_index')}")
        print(f" metadata.function_name: {metadata.get('function_name')}")
        print(f" metadata.repo_name: '{metadata.get('repo_name')}'")
        print(f" metadata.path: '{metadata.get('path')}'")
        print(f" metadata.language: '{metadata.get('language')}'")
        # Only the first entry is of interest here.
        break
    else:
        # The loop finished without a break: the file had no non-blank line.
        print("No entries found.")
|
|
|
|
|
|
|
|
# --- Section 3: test whether JSONL row_number is the 1-based CSV data-row index ---
print("\n\n=== Checking if row_number matches CSV row ===")
with open('programming_problems.jsonl', 'r', encoding='utf-8') as f:
    first_line = f.readline()

# readline() returns '' for an empty file, which json.loads cannot parse;
# fall back to an empty entry so the section reports the problem instead.
data = json.loads(first_line) if first_line.strip() else {}
target_row = data.get('row_number')
print(f"JSONL row_number: {target_row}")

if target_row is None:
    # Without a row number there is nothing to look up; avoid scanning the
    # whole CSV only to print nothing.
    print("\nNo row_number in the first JSONL entry; skipping the CSV lookup.")
else:
    # newline='' as required by the csv module so that line breaks embedded in
    # quoted fields are parsed correctly.
    with open('function_dataset_v2.csv', 'r', encoding='utf-8', newline='') as f:
        reader = csv.DictReader(f)
        # Count data rows from 1 (the header line is consumed by DictReader).
        for csv_row_number, row in enumerate(reader, start=1):
            if csv_row_number != target_row:
                continue

            print(f"\nCSV row {target_row}:")
            print(f" original_index: {row['original_index']}")
            print(f" repo_name: '{row['repo_name']}'")
            print(f" path: '{row['path']}'")
            print(f" language: '{row['language']}'")
            print(f" function_name: '{row['function_name']}'")

            # Direct indexing on purpose: a missing key should fail loudly
            # rather than compare as None and risk a false match.
            if row['function_name'] == data['metadata']['function_name']:
                print("\n✅ Function names match! We should use row_number as the key.")
            else:
                print("\n❌ Function names don't match.")
            break
        else:
            # No break happened: the CSV ended before reaching target_row (or
            # target_row is not a positive integer row index).
            print(f"\nCSV has no row {target_row}; row_number cannot be verified.")
|
|
|