File size: 6,463 Bytes
fe8e53f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
#!/usr/bin/env python3
"""
Memory-efficient script to enrich programming_problems.jsonl
Only loads the exact rows we need from enhanced_dataset.csv
"""

import json
import csv
from tqdm import tqdm
import sys

def get_needed_original_indices(function_csv, input_jsonl):
    """
    Work out which original_index values actually have to be looked up.

    The 1-based position of each data row in ``function_csv`` is used as its
    row_number, and that row's ``original_index`` column is the value to look
    up later. Only original_index values reached through a ``row_number``
    found in ``input_jsonl`` are reported as needed.

    Args:
        function_csv: Path to the CSV that has an ``original_index`` column.
        input_jsonl: Path to the JSONL file whose records carry ``row_number``.

    Returns:
        Tuple ``(row_to_original, needed_indices)`` where
            row_to_original maps row_number -> original_index for every CSV
                row whose original_index could be parsed as an int, and
            needed_indices maps original_index -> list of row_numbers from
                the JSONL file that refer to it.
    """
    print("Step 1: Determining which original_index values we need...")

    # First, get row_number to original_index mapping from function_dataset_v2
    row_to_original = {}
    skipped_rows = 0
    # newline='' is what the csv module asks for when it is handed a file object.
    with open(function_csv, 'r', encoding='utf-8', newline='') as f:
        reader = csv.DictReader(f)
        for i, row in enumerate(tqdm(reader, desc="Reading function_dataset_v2"), start=1):
            try:
                row_to_original[i] = int(row['original_index'])
            except (ValueError, KeyError, TypeError):
                # TypeError covers short rows: DictReader fills the missing
                # trailing fields with None, and int(None) is a TypeError.
                skipped_rows += 1

    if skipped_rows:
        print(f"Skipped {skipped_rows} rows without a usable original_index")

    # Next, get the row_numbers from JSONL that we need to enrich
    needed_indices = {}
    with open(input_jsonl, 'r', encoding='utf-8') as f:
        # total is a hard-coded expected line count; it only feeds the progress bar.
        for line in tqdm(f, desc="Reading JSONL", total=22532):
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline) are not JSON records.
                continue
            row_number = json.loads(line).get('row_number')

            if row_number in row_to_original:
                original_index = row_to_original[row_number]
                needed_indices.setdefault(original_index, []).append(row_number)

    print(f"Need to look up {len(needed_indices)} unique original_index values")
    # max()/min() raise ValueError on an empty mapping, so only report the
    # range when there is something to report.
    if needed_indices:
        print(f"Max index needed: {max(needed_indices)}")
        print(f"Min index needed: {min(needed_indices)}")

    return row_to_original, needed_indices


def load_needed_metadata(enhanced_csv, needed_indices):
    """
    Load only the needed rows from enhanced_dataset.csv.

    The file is streamed row by row; a row is kept only when its index column
    (the first present of '', 'Unnamed: 0.1', 'Unnamed: 0') parses to an int
    that is still being looked for. Scanning stops as soon as every requested
    index has been found, and is skipped entirely when nothing is requested.

    Args:
        enhanced_csv: Path to enhanced_dataset.csv
        needed_indices: The original_index values to look up. Either a dict
            keyed by original_index (only the keys are used) or any other
            iterable of ints.

    Returns:
        Dictionary mapping original_index to {repo_name, path, language}.
        Indices that never appear in the file are absent from the result.
    """
    # set() over a dict iterates its keys, so dicts and plain iterables both work.
    needed_remaining = set(needed_indices)
    total_needed = len(needed_remaining)

    print("\nStep 2: Loading only needed rows from enhanced_dataset.csv...")
    print(f"Looking for {total_needed} unique indices...")

    mapping = {}

    if not needed_remaining:
        # The early exit below only triggers after a hit, so with nothing to
        # find the whole file would be read for no result.
        print("Nothing to look up - skipping the scan of enhanced_dataset.csv")
    else:
        print("This will scan the entire file - may take several minutes...")

        # newline='' is what the csv module asks for when it is handed a file object.
        with open(enhanced_csv, 'r', encoding='utf-8', newline='') as f:
            reader = csv.DictReader(f)

            for i, row in enumerate(tqdm(reader, desc="Reading enhanced_dataset")):
                # Get the index from various possible column names
                raw_idx = row.get('', row.get('Unnamed: 0.1', row.get('Unnamed: 0')))
                if not raw_idx:
                    continue

                try:
                    idx = int(raw_idx)
                except ValueError:
                    # Non-numeric index cell: this row cannot match anything.
                    continue

                if idx not in needed_remaining:
                    continue

                mapping[idx] = {
                    'repo_name': row.get('repo_name', ''),
                    'path': row.get('path', ''),
                    'language': row.get('language', '')
                }
                # Removing the index means a later duplicate row is ignored.
                needed_remaining.remove(idx)

                # Progress update every 1000 found
                if len(mapping) % 1000 == 0:
                    print(f"Found {len(mapping)}/{total_needed} needed indices...")

                # Early exit if we found everything
                if not needed_remaining:
                    print(f"Found all needed indices at row {i}!")
                    break

    print(f"Loaded metadata for {len(mapping)} indices")
    print(f"Missing: {len(needed_remaining)} indices")

    if needed_remaining:
        print(f"Example missing indices: {list(needed_remaining)[:10]}")

    return mapping


def enrich_programming_problems(input_jsonl, output_jsonl, metadata_mapping, row_to_original):
    """
    Enrich programming_problems.jsonl with metadata.

    Every non-blank record of ``input_jsonl`` is written to ``output_jsonl``.
    When a record's ``row_number`` resolves through ``row_to_original`` to an
    original_index present in ``metadata_mapping``, the ``repo_name``,
    ``path`` and ``language`` values are copied into the record's
    ``metadata`` object; otherwise the record is written unchanged.

    Args:
        input_jsonl: Path of the JSONL file to read.
        output_jsonl: Path of the JSONL file to write (overwritten).
        metadata_mapping: Dict mapping original_index to a dict with the keys
            'repo_name', 'path' and 'language'.
        row_to_original: Dict mapping row_number to original_index.

    Returns:
        Tuple ``(matched_count, unmatched_count)``.
    """
    print("\nStep 3: Enriching JSONL file...")

    matched_count = 0
    unmatched_count = 0

    with open(input_jsonl, 'r', encoding='utf-8') as f_in, \
         open(output_jsonl, 'w', encoding='utf-8') as f_out:

        # total is a hard-coded expected line count; it only feeds the progress bar.
        for line in tqdm(f_in, desc="Processing JSONL", total=22532):
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline) are not JSON records.
                continue

            data = json.loads(line)
            row_number = data.get('row_number')

            enrichment = None
            if row_number in row_to_original:
                enrichment = metadata_mapping.get(row_to_original[row_number])

            if enrichment is None:
                unmatched_count += 1
            else:
                # A record may lack a 'metadata' object (or have it as null);
                # create one instead of failing on the item assignment.
                metadata = data.get('metadata')
                if metadata is None:
                    metadata = data['metadata'] = {}
                metadata['repo_name'] = enrichment['repo_name']
                metadata['path'] = enrichment['path']
                metadata['language'] = enrichment['language']
                matched_count += 1

            f_out.write(json.dumps(data, ensure_ascii=False) + '\n')

    return matched_count, unmatched_count


def main():
    """
    Run the three enrichment steps using the script's fixed file names.

    Reads function_dataset_v2.csv, programming_problems.jsonl and
    enhanced_dataset.csv from the current working directory and writes
    programming_problems_enriched.jsonl, then prints a summary.

    Returns:
        0, which the entry-point guard passes to sys.exit().
    """
    enhanced_csv = 'enhanced_dataset.csv'
    function_csv = 'function_dataset_v2.csv'
    input_jsonl = 'programming_problems.jsonl'
    output_jsonl = 'programming_problems_enriched.jsonl'

    # Step 1: Determine what we need
    row_to_original, needed_indices = get_needed_original_indices(function_csv, input_jsonl)

    # Step 2: Load only what we need
    metadata_mapping = load_needed_metadata(enhanced_csv, needed_indices)

    # Step 3: Enrich the JSONL
    matched, unmatched = enrich_programming_problems(input_jsonl, output_jsonl,
                                                     metadata_mapping, row_to_original)

    total = matched + unmatched

    print(f"\n{'='*60}")
    print("✅ Enrichment complete!")
    print(f"{'='*60}")
    print(f"Output written to: {output_jsonl}")
    print(f"Matched: {matched}")
    print(f"Unmatched: {unmatched}")
    print(f"Total: {total}")
    # An empty input gives total == 0; avoid dividing by zero in that case.
    if total:
        print(f"Match rate: {matched / total * 100:.1f}%")
    else:
        print("Match rate: n/a (no records processed)")

    return 0


# Run the pipeline only when executed as a script; main()'s return value
# becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())