"""
Generate synthetic training data for Code Comment Quality Classifier
"""
import os
import random

import pandas as pd
# Example comments for each category
EXCELLENT_COMMENTS = [
"This function calculates the Fibonacci sequence using dynamic programming to avoid redundant calculations. Time complexity: O(n), Space complexity: O(n)",
"Validates user input against SQL injection attacks using parameterized queries. Returns True if safe, False otherwise. Raises ValueError for invalid input types.",
"Binary search implementation for sorted arrays. Uses divide-and-conquer approach. Params: arr (sorted list), target (value). Returns: index or -1 if not found.",
"Implements the Singleton pattern to ensure only one instance of DatabaseConnection exists. Thread-safe using double-checked locking.",
"Parses JSON configuration file and validates against schema. Handles nested objects and arrays. Raises ConfigurationError if validation fails.",
"Asynchronous HTTP request handler with retry logic and exponential backoff. Max retries: 3. Timeout: 30s. Returns: Response object or None on failure.",
"Generates secure random tokens for authentication using CSPRNG. Length: 32 bytes. Returns: hex-encoded string. Used in password reset flows.",
"Custom hook that debounces state updates to prevent excessive re-renders. Delay: configurable ms. Returns: debounced value and setter function.",
"Optimized matrix multiplication using Strassen's algorithm. Suitable for large matrices (n > 64). Time complexity: O(n^2.807).",
"Decorator that caches function results with LRU eviction policy. Max size: 128 entries. Thread-safe. Improves performance for expensive computations.",
]
HELPFUL_COMMENTS = [
"Calculates the sum of two numbers and returns the result",
"This function sorts the array in ascending order",
"Checks if the user is logged in before proceeding",
"Converts temperature from Celsius to Fahrenheit",
"Returns the current timestamp in UTC format",
"Validates email format using regex pattern",
"Fetches user data from the database by ID",
"Updates the UI when data changes",
"Handles file upload and saves to storage",
"Generates a random string of specified length",
"Removes duplicates from the list",
"Encrypts password before storing in database",
"Sends email notification to user",
"Formats date string for display",
"Calculates total price including tax",
]
UNCLEAR_COMMENTS = [
"does stuff",
"magic happens here",
"don't touch this",
"idk why this works but it does",
"temporary solution",
"quick fix",
"handles things",
"processes data",
"important function",
"legacy code",
"weird edge case",
"not sure what this does",
"complicated logic",
"TODO",
"fix me",
"helper method",
"utility function",
"wrapper",
"handler",
"manager",
]
OUTDATED_COMMENTS = [
"DEPRECATED: Use the new API endpoint instead",
"This will be removed in version 2.0",
"TODO: Refactor this to use async/await",
"Old implementation - kept for backwards compatibility",
"NOTE: This approach is no longer recommended",
"FIXME: Memory leak issue - needs update",
"Uses legacy authentication system",
"WARNING: This method is obsolete",
"Replaced by getUserInfo() in v1.5",
"Temporary workaround - pending proper fix",
"DEPRECATED: Direct database access - use ORM instead",
"Old validation logic - update to new schema",
"Uses outdated library - migrate to modern alternative",
"This was for Python 2 compatibility",
"FIXME: Security vulnerability - needs immediate update",
]
def generate_variations(base_comments: list, num_variations: int = 5) -> list:
"""Generate variations of base comments to increase dataset size."""
variations = []
prefixes = ["", "Note: ", "Important: ", "Info: ", ""]
suffixes = ["", ".", "...", " // end", ""]
for comment in base_comments:
variations.append(comment)
for _ in range(num_variations - 1):
prefix = random.choice(prefixes)
suffix = random.choice(suffixes)
varied = f"{prefix}{comment}{suffix}"
variations.append(varied)
return variations
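
# Illustrative sketch only: generate_variations is randomized, so the exact
# strings differ from run to run. The shape of the output looks like this:
#
#   >>> generate_variations(["Sorts the array in ascending order"], num_variations=3)
#   ['Sorts the array in ascending order',
#    'Note: Sorts the array in ascending order...',
#    'Sorts the array in ascending order // end']
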
def generate_dataset(output_path: str = "./data/comments.csv", samples_per_class: int = 250):
"""
Generate synthetic training dataset.
Args:
output_path: Path to save the CSV file
samples_per_class: Number of samples to generate per class
"""
print("=" * 60)
print("Generating Synthetic Training Data")
print("=" * 60)
    # Create the output directory if needed; fall back to the current
    # directory when output_path has no directory component.
    os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
    # Generate variations
    print("\nGenerating comment variations...")
    random.seed(42)  # make the variations reproducible, matching the fixed shuffle seed below

    # Ceiling division (-(-a // b)) guarantees at least samples_per_class
    # variations per class before trimming to the exact count.
    excellent_samples = generate_variations(EXCELLENT_COMMENTS, -(-samples_per_class // len(EXCELLENT_COMMENTS)))
    helpful_samples = generate_variations(HELPFUL_COMMENTS, -(-samples_per_class // len(HELPFUL_COMMENTS)))
    unclear_samples = generate_variations(UNCLEAR_COMMENTS, -(-samples_per_class // len(UNCLEAR_COMMENTS)))
    outdated_samples = generate_variations(OUTDATED_COMMENTS, -(-samples_per_class // len(OUTDATED_COMMENTS)))

    # Trim each class to exactly samples_per_class
    excellent_samples = excellent_samples[:samples_per_class]
    helpful_samples = helpful_samples[:samples_per_class]
    unclear_samples = unclear_samples[:samples_per_class]
    outdated_samples = outdated_samples[:samples_per_class]
    # Create DataFrame
    data = {
        'comment': (
            excellent_samples +
            helpful_samples +
            unclear_samples +
            outdated_samples
        ),
        'label': (
            ['excellent'] * len(excellent_samples) +
            ['helpful'] * len(helpful_samples) +
            ['unclear'] * len(unclear_samples) +
            ['outdated'] * len(outdated_samples)
        )
    }
    df = pd.DataFrame(data)

    # Shuffle the dataset
    df = df.sample(frac=1, random_state=42).reset_index(drop=True)

    # Save to CSV
    df.to_csv(output_path, index=False)
print(f"\n✓ Dataset generated successfully!")
print(f"✓ Total samples: {len(df)}")
print(f"✓ Saved to: {output_path}")
print("\nClass distribution:")
print(df['label'].value_counts().sort_index())
print("\nSample comments:")
print("-" * 60)
for label in ['excellent', 'helpful', 'unclear', 'outdated']:
sample = df[df['label'] == label].iloc[0]['comment']
print(f"\n[{label.upper()}]")
print(f" {sample}")
print("\n" + "=" * 60)
print("Data generation complete! 🎉")
print("=" * 60)
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Generate synthetic training data")
    parser.add_argument(
        "--output",
        type=str,
        default="./data/comments.csv",
        help="Output path for the CSV file"
    )
    parser.add_argument(
        "--samples-per-class",
        type=int,
        default=250,
        help="Number of samples to generate per class"
    )
    args = parser.parse_args()

    generate_dataset(args.output, args.samples_per_class)
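
# Example invocations from the repository root (the scripts/ path follows this
# repo's layout; adjust to your checkout):
#   python scripts/generate_data.py
#   python scripts/generate_data.py --output ./data/comments.csv --samples-per-class 250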