dataset-builder / data3 /check_match.py
SunDou's picture
Upload data3/check_match.py with huggingface_hub
9001624 verified
#!/usr/bin/env python3
import csv
import json
# Default inputs; the functions below accept overrides so the checks can be
# pointed at other files.
CSV_PATH = 'function_dataset_v2.csv'
JSONL_PATH = 'programming_problems.jsonl'

# Columns that must all be non-empty for a CSV row to count as "complete".
METADATA_FIELDS = ('repo_name', 'path', 'language')


def show_csv_row(csv_path=CSV_PATH, row_number=57):
    """Print the metadata columns of one data row of a CSV file.

    Args:
        csv_path: Path of the CSV file; its first line is the header.
        row_number: 1-based index of the data row to show (the header line
            is not counted).

    Returns:
        The row as produced by ``csv.DictReader``, or ``None`` when the file
        has fewer than ``row_number`` data rows.

    Raises:
        ValueError: If ``row_number`` is smaller than 1.
        KeyError: If the row lacks one of the printed columns.
    """
    if row_number < 1:
        raise ValueError(f"row_number must be >= 1, got {row_number}")

    print(f"Reading row {row_number} from CSV...")
    rows_seen = 0
    # newline='' is what the csv module requires: without it, line breaks
    # embedded in quoted fields are rewritten before the parser sees them.
    with open(csv_path, 'r', newline='', encoding='utf-8') as f:
        for rows_seen, row in enumerate(csv.DictReader(f), start=1):
            if rows_seen == row_number:
                print(f"Row {rows_seen}:")
                print(f" original_index: {row['original_index']}")
                print(f" repo_name: '{row['repo_name']}'")
                print(f" path: '{row['path']}'")
                print(f" language: '{row['language']}'")
                print(f" function_name: '{row['function_name']}'")
                return row

    # Say so explicitly instead of printing nothing at all.
    print(f"Row {row_number} not found: CSV has only {rows_seen} data rows")
    return None


def show_first_jsonl_entry(jsonl_path=JSONL_PATH):
    """Print the metadata of the first record of a JSONL file.

    Args:
        jsonl_path: Path of the JSON Lines file.

    Returns:
        The ``metadata`` mapping of the first record, or ``None`` when the
        first line is blank (e.g. the file is empty).

    Raises:
        json.JSONDecodeError: If the first line is not valid JSON.
        KeyError: If the record lacks ``metadata`` or one of the printed keys.
    """
    print("\n\nChecking first JSONL entry...")
    with open(jsonl_path, 'r', encoding='utf-8') as f:
        first_line = f.readline()

    if not first_line.strip():
        print("First line of JSONL file is empty; nothing to check")
        return None

    metadata = json.loads(first_line)['metadata']
    print(f"original_index: {metadata['original_index']}")
    print(f"function_name: {metadata['function_name']}")
    print(f"Current repo_name: '{metadata['repo_name']}'")
    print(f"Current path: '{metadata['path']}'")
    print(f"Current language: '{metadata['language']}'")
    return metadata


def count_complete_rows(csv_path=CSV_PATH, required_fields=METADATA_FIELDS):
    """Count the CSV data rows whose required columns are all non-empty.

    Args:
        csv_path: Path of the CSV file; its first line is the header.
        required_fields: Column names that must all hold a non-empty value.

    Returns:
        A ``(total, complete)`` tuple of row counts.

    Raises:
        KeyError: If a required column is absent from the file.
    """
    print("\n\nCounting CSV rows with complete metadata...")
    total = 0
    complete = 0
    with open(csv_path, 'r', newline='', encoding='utf-8') as f:
        for row in csv.DictReader(f):
            total += 1
            # Empty strings and None (short rows) both count as missing.
            if all(row[field] for field in required_fields):
                complete += 1

    print(f"Total CSV rows: {total}")
    print(f"Rows with complete metadata: {complete}")
    print(f"Rows with missing metadata: {total - complete}")
    return total, complete


def main():
    """Run the three checks against the default files, in order."""
    show_csv_row()
    show_first_jsonl_entry()
    count_complete_rows()


if __name__ == "__main__":
    main()