"""
Generate synthetic training data for Code Comment Quality Classifier
"""
import os
import random

import pandas as pd
# Example comments for each category
EXCELLENT_COMMENTS = [
"This function calculates the Fibonacci sequence using dynamic programming to avoid redundant calculations. Time complexity: O(n), Space complexity: O(n)",
"Validates user input against SQL injection attacks using parameterized queries. Returns True if safe, False otherwise. Raises ValueError for invalid input types.",
"Binary search implementation for sorted arrays. Uses divide-and-conquer approach. Params: arr (sorted list), target (value). Returns: index or -1 if not found.",
"Implements the Singleton pattern to ensure only one instance of DatabaseConnection exists. Thread-safe using double-checked locking.",
"Parses JSON configuration file and validates against schema. Handles nested objects and arrays. Raises ConfigurationError if validation fails.",
"Asynchronous HTTP request handler with retry logic and exponential backoff. Max retries: 3. Timeout: 30s. Returns: Response object or None on failure.",
"Generates secure random tokens for authentication using CSPRNG. Length: 32 bytes. Returns: hex-encoded string. Used in password reset flows.",
"Custom hook that debounces state updates to prevent excessive re-renders. Delay: configurable ms. Returns: debounced value and setter function.",
"Optimized matrix multiplication using Strassen's algorithm. Suitable for large matrices (n > 64). Time complexity: O(n^2.807).",
"Decorator that caches function results with LRU eviction policy. Max size: 128 entries. Thread-safe. Improves performance for expensive computations.",
]
HELPFUL_COMMENTS = [
"Calculates the sum of two numbers and returns the result",
"This function sorts the array in ascending order",
"Checks if the user is logged in before proceeding",
"Converts temperature from Celsius to Fahrenheit",
"Returns the current timestamp in UTC format",
"Validates email format using regex pattern",
"Fetches user data from the database by ID",
"Updates the UI when data changes",
"Handles file upload and saves to storage",
"Generates a random string of specified length",
"Removes duplicates from the list",
"Encrypts password before storing in database",
"Sends email notification to user",
"Formats date string for display",
"Calculates total price including tax",
]
UNCLEAR_COMMENTS = [
"does stuff",
"magic happens here",
"don't touch this",
"idk why this works but it does",
"temporary solution",
"quick fix",
"handles things",
"processes data",
"important function",
"legacy code",
"weird edge case",
"not sure what this does",
"complicated logic",
"TODO",
"fix me",
"helper method",
"utility function",
"wrapper",
"handler",
"manager",
]
OUTDATED_COMMENTS = [
"DEPRECATED: Use the new API endpoint instead",
"This will be removed in version 2.0",
"TODO: Refactor this to use async/await",
"Old implementation - kept for backwards compatibility",
"NOTE: This approach is no longer recommended",
"FIXME: Memory leak issue - needs update",
"Uses legacy authentication system",
"WARNING: This method is obsolete",
"Replaced by getUserInfo() in v1.5",
"Temporary workaround - pending proper fix",
"DEPRECATED: Direct database access - use ORM instead",
"Old validation logic - update to new schema",
"Uses outdated library - migrate to modern alternative",
"This was for Python 2 compatibility",
"FIXME: Security vulnerability - needs immediate update",
]
def generate_variations(base_comments: list, num_variations: int = 5) -> list:
"""Generate variations of base comments to increase dataset size."""
variations = []
prefixes = ["", "Note: ", "Important: ", "Info: ", ""]
suffixes = ["", ".", "...", " // end", ""]
for comment in base_comments:
variations.append(comment)
for _ in range(num_variations - 1):
prefix = random.choice(prefixes)
suffix = random.choice(suffixes)
varied = f"{prefix}{comment}{suffix}"
variations.append(varied)
return variations
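
# Illustrative sketch only: generate_variations is randomized, so the exact
# strings differ from run to run. The shape of the output looks like this:
#
#   >>> generate_variations(["Sorts the array in ascending order"], num_variations=3)
#   ['Sorts the array in ascending order',
#    'Note: Sorts the array in ascending order...',
#    'Sorts the array in ascending order // end']
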
def generate_dataset(output_path: str = "./data/comments.csv", samples_per_class: int = 250):
"""
Generate synthetic training dataset.
Args:
output_path: Path to save the CSV file
samples_per_class: Number of samples to generate per class
"""
print("=" * 60)
print("Generating Synthetic Training Data")
print("=" * 60)
    # Create the output directory if needed; fall back to the current
    # directory when output_path has no directory component.
    os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
    # Generate variations
    print("\nGenerating comment variations...")
    random.seed(42)  # make the variations reproducible, matching the fixed shuffle seed below

    # Ceiling division (-(-a // b)) guarantees at least samples_per_class
    # variations per class before trimming to the exact count.
    excellent_samples = generate_variations(EXCELLENT_COMMENTS, -(-samples_per_class // len(EXCELLENT_COMMENTS)))
    helpful_samples = generate_variations(HELPFUL_COMMENTS, -(-samples_per_class // len(HELPFUL_COMMENTS)))
    unclear_samples = generate_variations(UNCLEAR_COMMENTS, -(-samples_per_class // len(UNCLEAR_COMMENTS)))
    outdated_samples = generate_variations(OUTDATED_COMMENTS, -(-samples_per_class // len(OUTDATED_COMMENTS)))

    # Trim each class to exactly samples_per_class
    excellent_samples = excellent_samples[:samples_per_class]
    helpful_samples = helpful_samples[:samples_per_class]
    unclear_samples = unclear_samples[:samples_per_class]
    outdated_samples = outdated_samples[:samples_per_class]
    # Create DataFrame
    data = {
        'comment': (
            excellent_samples +
            helpful_samples +
            unclear_samples +
            outdated_samples
        ),
        'label': (
            ['excellent'] * len(excellent_samples) +
            ['helpful'] * len(helpful_samples) +
            ['unclear'] * len(unclear_samples) +
            ['outdated'] * len(outdated_samples)
        )
    }
    df = pd.DataFrame(data)

    # Shuffle the dataset
    df = df.sample(frac=1, random_state=42).reset_index(drop=True)

    # Save to CSV
    df.to_csv(output_path, index=False)
print(f"\n✓ Dataset generated successfully!")
print(f"✓ Total samples: {len(df)}")
print(f"✓ Saved to: {output_path}")
print("\nClass distribution:")
print(df['label'].value_counts().sort_index())
print("\nSample comments:")
print("-" * 60)
for label in ['excellent', 'helpful', 'unclear', 'outdated']:
sample = df[df['label'] == label].iloc[0]['comment']
print(f"\n[{label.upper()}]")
print(f" {sample}")
print("\n" + "=" * 60)
print("Data generation complete! 🎉")
print("=" * 60)
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Generate synthetic training data")
    parser.add_argument(
        "--output",
        type=str,
        default="./data/comments.csv",
        help="Output path for the CSV file"
    )
    parser.add_argument(
        "--samples-per-class",
        type=int,
        default=250,
        help="Number of samples to generate per class"
    )
    args = parser.parse_args()

    generate_dataset(args.output, args.samples_per_class)
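
# Example invocations from the repository root (the scripts/ path follows this
# repo's layout; adjust to your checkout):
#   python scripts/generate_data.py
#   python scripts/generate_data.py --output ./data/comments.csv --samples-per-class 250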