|
|
""" |
|
|
Generate synthetic training data for Code Comment Quality Classifier |
|
|
""" |
|
|
import pandas as pd |
|
|
import os |
|
|
import random |
|
|
|
|
|
|
|
|
|
|
|
# Base samples for the "excellent" class: comments that state purpose plus
# concrete detail (algorithm, complexity, params/returns, error behavior).
EXCELLENT_COMMENTS = [
    "This function calculates the Fibonacci sequence using dynamic programming to avoid redundant calculations. Time complexity: O(n), Space complexity: O(n)",
    "Validates user input against SQL injection attacks using parameterized queries. Returns True if safe, False otherwise. Raises ValueError for invalid input types.",
    "Binary search implementation for sorted arrays. Uses divide-and-conquer approach. Params: arr (sorted list), target (value). Returns: index or -1 if not found.",
    "Implements the Singleton pattern to ensure only one instance of DatabaseConnection exists. Thread-safe using double-checked locking.",
    "Parses JSON configuration file and validates against schema. Handles nested objects and arrays. Raises ConfigurationError if validation fails.",
    "Asynchronous HTTP request handler with retry logic and exponential backoff. Max retries: 3. Timeout: 30s. Returns: Response object or None on failure.",
    "Generates secure random tokens for authentication using CSPRNG. Length: 32 bytes. Returns: hex-encoded string. Used in password reset flows.",
    "Custom hook that debounces state updates to prevent excessive re-renders. Delay: configurable ms. Returns: debounced value and setter function.",
    "Optimized matrix multiplication using Strassen's algorithm. Suitable for large matrices (n > 64). Time complexity: O(n^2.807).",
    "Decorator that caches function results with LRU eviction policy. Max size: 128 entries. Thread-safe. Improves performance for expensive computations.",
]
|
|
|
|
|
# Base samples for the "helpful" class: accurate one-line summaries that lack
# the depth (complexity, params, error cases) of the "excellent" class.
HELPFUL_COMMENTS = [
    "Calculates the sum of two numbers and returns the result",
    "This function sorts the array in ascending order",
    "Checks if the user is logged in before proceeding",
    "Converts temperature from Celsius to Fahrenheit",
    "Returns the current timestamp in UTC format",
    "Validates email format using regex pattern",
    "Fetches user data from the database by ID",
    "Updates the UI when data changes",
    "Handles file upload and saves to storage",
    "Generates a random string of specified length",
    "Removes duplicates from the list",
    "Encrypts password before storing in database",
    "Sends email notification to user",
    "Formats date string for display",
    "Calculates total price including tax",
]
|
|
|
|
|
# Base samples for the "unclear" class: vague, low-information comments that
# give a reader no actionable understanding of the code.
UNCLEAR_COMMENTS = [
    "does stuff",
    "magic happens here",
    "don't touch this",
    "idk why this works but it does",
    "temporary solution",
    "quick fix",
    "handles things",
    "processes data",
    "important function",
    "legacy code",
    "weird edge case",
    "not sure what this does",
    "complicated logic",
    "TODO",
    "fix me",
    "helper method",
    "utility function",
    "wrapper",
    "handler",
    "manager",
]
|
|
|
|
|
# Base samples for the "outdated" class: comments signalling deprecated,
# obsolete, or pending-removal code (DEPRECATED/FIXME/legacy markers).
OUTDATED_COMMENTS = [
    "DEPRECATED: Use the new API endpoint instead",
    "This will be removed in version 2.0",
    "TODO: Refactor this to use async/await",
    "Old implementation - kept for backwards compatibility",
    "NOTE: This approach is no longer recommended",
    "FIXME: Memory leak issue - needs update",
    "Uses legacy authentication system",
    "WARNING: This method is obsolete",
    "Replaced by getUserInfo() in v1.5",
    "Temporary workaround - pending proper fix",
    "DEPRECATED: Direct database access - use ORM instead",
    "Old validation logic - update to new schema",
    "Uses outdated library - migrate to modern alternative",
    "This was for Python 2 compatibility",
    "FIXME: Security vulnerability - needs immediate update",
]
|
|
|
|
|
|
|
|
def generate_variations(base_comments: list, num_variations: int = 5, seed=None) -> list:
    """Generate variations of base comments to increase dataset size.

    Each base comment is kept verbatim once, then (num_variations - 1) copies
    are emitted with a randomly chosen prefix/suffix wrapped around it.

    Args:
        base_comments: Base comment strings to expand.
        num_variations: Total samples produced per base comment (>= 1);
            values < 1 yield just the base comments themselves.
        seed: Optional seed for reproducible output. When None (the default),
            variations are non-deterministic, matching the original behavior.

    Returns:
        List of len(base_comments) * max(1, num_variations) comment strings,
        grouped per base comment in input order.
    """
    # Local RNG so an explicit seed gives reproducible data without
    # touching the global random module's state.
    rng = random.Random(seed)

    variations = []

    # Empty strings are repeated on purpose: they weight the draw toward
    # emitting the comment unchanged.
    prefixes = ["", "Note: ", "Important: ", "Info: ", ""]
    suffixes = ["", ".", "...", " // end", ""]

    for comment in base_comments:
        variations.append(comment)
        for _ in range(num_variations - 1):
            prefix = rng.choice(prefixes)
            suffix = rng.choice(suffixes)
            variations.append(f"{prefix}{comment}{suffix}")

    return variations
|
|
|
|
|
|
|
|
def generate_dataset(output_path: str = "./data/comments.csv", samples_per_class: int = 250): |
|
|
""" |
|
|
Generate synthetic training dataset. |
|
|
|
|
|
Args: |
|
|
output_path: Path to save the CSV file |
|
|
samples_per_class: Number of samples to generate per class |
|
|
""" |
|
|
print("=" * 60) |
|
|
print("Generating Synthetic Training Data") |
|
|
print("=" * 60) |
|
|
|
|
|
|
|
|
os.makedirs(os.path.dirname(output_path), exist_ok=True) |
|
|
|
|
|
|
|
|
print("\nGenerating comment variations...") |
|
|
excellent_samples = generate_variations(EXCELLENT_COMMENTS, samples_per_class // len(EXCELLENT_COMMENTS)) |
|
|
helpful_samples = generate_variations(HELPFUL_COMMENTS, samples_per_class // len(HELPFUL_COMMENTS)) |
|
|
unclear_samples = generate_variations(UNCLEAR_COMMENTS, samples_per_class // len(UNCLEAR_COMMENTS)) |
|
|
outdated_samples = generate_variations(OUTDATED_COMMENTS, samples_per_class // len(OUTDATED_COMMENTS)) |
|
|
|
|
|
|
|
|
excellent_samples = excellent_samples[:samples_per_class] |
|
|
helpful_samples = helpful_samples[:samples_per_class] |
|
|
unclear_samples = unclear_samples[:samples_per_class] |
|
|
outdated_samples = outdated_samples[:samples_per_class] |
|
|
|
|
|
|
|
|
data = { |
|
|
'comment': ( |
|
|
excellent_samples + |
|
|
helpful_samples + |
|
|
unclear_samples + |
|
|
outdated_samples |
|
|
), |
|
|
'label': ( |
|
|
['excellent'] * len(excellent_samples) + |
|
|
['helpful'] * len(helpful_samples) + |
|
|
['unclear'] * len(unclear_samples) + |
|
|
['outdated'] * len(outdated_samples) |
|
|
) |
|
|
} |
|
|
|
|
|
df = pd.DataFrame(data) |
|
|
|
|
|
|
|
|
df = df.sample(frac=1, random_state=42).reset_index(drop=True) |
|
|
|
|
|
|
|
|
df.to_csv(output_path, index=False) |
|
|
|
|
|
print(f"\n✓ Dataset generated successfully!") |
|
|
print(f"✓ Total samples: {len(df)}") |
|
|
print(f"✓ Saved to: {output_path}") |
|
|
|
|
|
print("\nClass distribution:") |
|
|
print(df['label'].value_counts().sort_index()) |
|
|
|
|
|
print("\nSample comments:") |
|
|
print("-" * 60) |
|
|
for label in ['excellent', 'helpful', 'unclear', 'outdated']: |
|
|
sample = df[df['label'] == label].iloc[0]['comment'] |
|
|
print(f"\n[{label.upper()}]") |
|
|
print(f" {sample}") |
|
|
|
|
|
print("\n" + "=" * 60) |
|
|
print("Data generation complete! 🎉") |
|
|
print("=" * 60) |
|
|
|
|
|
|
|
|
if __name__ == "__main__": |
|
|
import argparse |
|
|
|
|
|
parser = argparse.ArgumentParser(description="Generate synthetic training data") |
|
|
parser.add_argument( |
|
|
"--output", |
|
|
type=str, |
|
|
default="./data/comments.csv", |
|
|
help="Output path for the CSV file" |
|
|
) |
|
|
parser.add_argument( |
|
|
"--samples-per-class", |
|
|
type=int, |
|
|
default=250, |
|
|
help="Number of samples to generate per class" |
|
|
) |
|
|
args = parser.parse_args() |
|
|
|
|
|
generate_dataset(args.output, args.samples_per_class) |
|
|
|