File size: 7,354 Bytes
7762e8f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 |
"""
Generate synthetic training data for Code Comment Quality Classifier
"""
import pandas as pd
import os
import random
# Seed comments for each of the four quality labels. generate_variations()
# expands these into the full dataset, so list lengths matter when sizing
# samples_per_class (currently 10 / 15 / 20 / 15 entries respectively).

# High-quality comments: state purpose, contracts, edge cases, complexity.
EXCELLENT_COMMENTS = [
"This function calculates the Fibonacci sequence using dynamic programming to avoid redundant calculations. Time complexity: O(n), Space complexity: O(n)",
"Validates user input against SQL injection attacks using parameterized queries. Returns True if safe, False otherwise. Raises ValueError for invalid input types.",
"Binary search implementation for sorted arrays. Uses divide-and-conquer approach. Params: arr (sorted list), target (value). Returns: index or -1 if not found.",
"Implements the Singleton pattern to ensure only one instance of DatabaseConnection exists. Thread-safe using double-checked locking.",
"Parses JSON configuration file and validates against schema. Handles nested objects and arrays. Raises ConfigurationError if validation fails.",
"Asynchronous HTTP request handler with retry logic and exponential backoff. Max retries: 3. Timeout: 30s. Returns: Response object or None on failure.",
"Generates secure random tokens for authentication using CSPRNG. Length: 32 bytes. Returns: hex-encoded string. Used in password reset flows.",
"Custom hook that debounces state updates to prevent excessive re-renders. Delay: configurable ms. Returns: debounced value and setter function.",
"Optimized matrix multiplication using Strassen's algorithm. Suitable for large matrices (n > 64). Time complexity: O(n^2.807).",
"Decorator that caches function results with LRU eviction policy. Max size: 128 entries. Thread-safe. Improves performance for expensive computations.",
]
# Adequate comments: describe what the code does, but little about why or how.
HELPFUL_COMMENTS = [
"Calculates the sum of two numbers and returns the result",
"This function sorts the array in ascending order",
"Checks if the user is logged in before proceeding",
"Converts temperature from Celsius to Fahrenheit",
"Returns the current timestamp in UTC format",
"Validates email format using regex pattern",
"Fetches user data from the database by ID",
"Updates the UI when data changes",
"Handles file upload and saves to storage",
"Generates a random string of specified length",
"Removes duplicates from the list",
"Encrypts password before storing in database",
"Sends email notification to user",
"Formats date string for display",
"Calculates total price including tax",
]
# Low-quality comments: vague, placeholder, or uninformative text.
UNCLEAR_COMMENTS = [
"does stuff",
"magic happens here",
"don't touch this",
"idk why this works but it does",
"temporary solution",
"quick fix",
"handles things",
"processes data",
"important function",
"legacy code",
"weird edge case",
"not sure what this does",
"complicated logic",
"TODO",
"fix me",
"helper method",
"utility function",
"wrapper",
"handler",
"manager",
]
# Comments flagging stale, deprecated, or superseded code.
OUTDATED_COMMENTS = [
"DEPRECATED: Use the new API endpoint instead",
"This will be removed in version 2.0",
"TODO: Refactor this to use async/await",
"Old implementation - kept for backwards compatibility",
"NOTE: This approach is no longer recommended",
"FIXME: Memory leak issue - needs update",
"Uses legacy authentication system",
"WARNING: This method is obsolete",
"Replaced by getUserInfo() in v1.5",
"Temporary workaround - pending proper fix",
"DEPRECATED: Direct database access - use ORM instead",
"Old validation logic - update to new schema",
"Uses outdated library - migrate to modern alternative",
"This was for Python 2 compatibility",
"FIXME: Security vulnerability - needs immediate update",
]
def generate_variations(base_comments: list, num_variations: int = 5) -> list:
    """Expand each base comment into ``num_variations`` dataset entries.

    For every comment, the first entry emitted is the comment itself; the
    remaining ``num_variations - 1`` entries wrap it in a randomly chosen
    prefix and suffix. Either affix may be the empty string, so exact
    duplicates of the base comment can appear — acceptable for synthetic
    training data.

    Args:
        base_comments: Comments to expand.
        num_variations: Total entries produced per base comment.

    Returns:
        A flat list of length ``len(base_comments) * num_variations``
        (for ``num_variations >= 1``), grouped by base comment.
    """
    affix_heads = ["", "Note: ", "Important: ", "Info: ", ""]
    affix_tails = ["", ".", "...", " // end", ""]
    expanded = []
    for base in base_comments:
        expanded.append(base)
        expanded.extend(
            f"{random.choice(affix_heads)}{base}{random.choice(affix_tails)}"
            for _ in range(num_variations - 1)
        )
    return expanded
def generate_dataset(output_path: str = "./data/comments.csv", samples_per_class: int = 250):
    """
    Generate a synthetic, class-balanced training dataset and save it as CSV.

    Args:
        output_path: Path to save the CSV file
        samples_per_class: Number of samples to generate per class

    The CSV has two columns: 'comment' and 'label' (one of 'excellent',
    'helpful', 'unclear', 'outdated'). Rows are shuffled with a fixed
    seed (42) so the on-disk order is reproducible.
    """
    print("=" * 60)
    print("Generating Synthetic Training Data")
    print("=" * 60)
    # Only create a parent directory when the path actually has one:
    # os.makedirs("") raises FileNotFoundError for bare filenames.
    parent_dir = os.path.dirname(output_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    # Generate variations
    print("\nGenerating comment variations...")

    def _expand(base_comments: list) -> list:
        # Ceiling division: floor division under-produces whenever
        # samples_per_class is not a multiple of len(base_comments)
        # (e.g. 250 // 15 = 16 -> only 240 samples), which left the
        # classes imbalanced despite the truncation below.
        per_comment = -(-samples_per_class // len(base_comments))
        # Truncate to exactly samples_per_class entries per class.
        return generate_variations(base_comments, per_comment)[:samples_per_class]

    excellent_samples = _expand(EXCELLENT_COMMENTS)
    helpful_samples = _expand(HELPFUL_COMMENTS)
    unclear_samples = _expand(UNCLEAR_COMMENTS)
    outdated_samples = _expand(OUTDATED_COMMENTS)
    # Create DataFrame
    data = {
        'comment': (
            excellent_samples +
            helpful_samples +
            unclear_samples +
            outdated_samples
        ),
        'label': (
            ['excellent'] * len(excellent_samples) +
            ['helpful'] * len(helpful_samples) +
            ['unclear'] * len(unclear_samples) +
            ['outdated'] * len(outdated_samples)
        )
    }
    df = pd.DataFrame(data)
    # Shuffle the dataset (fixed seed for reproducible row order)
    df = df.sample(frac=1, random_state=42).reset_index(drop=True)
    # Save to CSV
    df.to_csv(output_path, index=False)
    print("\n✓ Dataset generated successfully!")
    print(f"✓ Total samples: {len(df)}")
    print(f"✓ Saved to: {output_path}")
    print("\nClass distribution:")
    print(df['label'].value_counts().sort_index())
    print("\nSample comments:")
    print("-" * 60)
    for label in ['excellent', 'helpful', 'unclear', 'outdated']:
        sample = df[df['label'] == label].iloc[0]['comment']
        print(f"\n[{label.upper()}]")
        print(f"  {sample}")
    print("\n" + "=" * 60)
    print("Data generation complete! 🎉")
    print("=" * 60)
if __name__ == "__main__":
    import argparse

    # CLI wrapper: defaults mirror generate_dataset()'s own defaults.
    arg_parser = argparse.ArgumentParser(description="Generate synthetic training data")
    arg_parser.add_argument("--output", type=str, default="./data/comments.csv",
                            help="Output path for the CSV file")
    arg_parser.add_argument("--samples-per-class", type=int, default=250,
                            help="Number of samples to generate per class")
    cli = arg_parser.parse_args()
    generate_dataset(cli.output, cli.samples_per_class)
|