Snaseem2026 committed
Commit 7762e8f · verified · 1 Parent(s): 4089b4a

Upload folder using huggingface_hub

scripts/evaluate.py ADDED
@@ -0,0 +1,205 @@
+ """
+ Evaluation script for trained model with comprehensive analysis
+ """
+ import argparse
+ import sys
+ import os
+ import numpy as np
+ import pandas as pd
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer
+
+ # Add parent directory to path
+ sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+
+ from src import (
+     load_config,
+     compute_metrics_factory,
+     plot_confusion_matrix,
+     print_classification_report
+ )
+ from src.data_loader import prepare_datasets_for_training
+
+
+ def analyze_errors(
+     test_dataset,
+     predictions: np.ndarray,
+     labels: np.ndarray,
+     id2label: dict,
+     tokenizer,
+     top_n: int = 10
+ ) -> pd.DataFrame:
+     """
+     Analyze misclassified examples.
+
+     Args:
+         test_dataset: Test dataset
+         predictions: Predicted labels
+         labels: True labels
+         id2label: Label mapping
+         tokenizer: Tokenizer to decode text
+         top_n: Number of examples to show per error type
+
+     Returns:
+         DataFrame with error analysis
+     """
+     errors = []
+     for i, (pred, true_label) in enumerate(zip(predictions, labels)):
+         if pred != true_label:
+             # Decode the comment (approximate, as original text is removed)
+             # Note: This is a limitation - we'd need to keep original text
+             errors.append({
+                 'index': i,
+                 'true_label': id2label[true_label],
+                 'predicted_label': id2label[pred],
+                 'error_type': f"{id2label[true_label]} -> {id2label[pred]}"
+             })
+
+     error_df = pd.DataFrame(errors)
+     if len(error_df) > 0:
+         print("\nError Analysis:")
+         print(f"Total errors: {len(error_df)}")
+         print("\nError type distribution:")
+         print(error_df['error_type'].value_counts())
+
+     return error_df
+
+
+ def evaluate_model(
+     model_path: str,
+     config_path: str = "config.yaml",
+     save_plots: bool = True
+ ):
+     """
+     Evaluate trained model on test set with comprehensive analysis.
+
+     Args:
+         model_path: Path to the trained model
+         config_path: Path to configuration file
+         save_plots: Whether to save visualization plots
+     """
+     print("=" * 60)
+     print("Model Evaluation")
+     print("=" * 60)
+
+     # Load config
+     config = load_config(config_path)
+
+     # Create output directory
+     output_dir = config['training'].get('output_dir', './results')
+     os.makedirs(output_dir, exist_ok=True)
+
+     # Load datasets
+     print("\n[1/5] Loading datasets...")
+     tokenized_datasets, label2id, id2label, _ = prepare_datasets_for_training(config_path)
+     test_dataset = tokenized_datasets['test']
+     print(f"✓ Test samples: {len(test_dataset)}")
+
+     # Load model and tokenizer
+     print("\n[2/5] Loading trained model...")
+     tokenizer = AutoTokenizer.from_pretrained(model_path)
+     model = AutoModelForSequenceClassification.from_pretrained(model_path)
+     print(f"✓ Model loaded from {model_path}")
+
+     # Create trainer for evaluation
+     print("\n[3/5] Running evaluation...")
+     compute_metrics_fn = compute_metrics_factory(id2label)
+     trainer = Trainer(
+         model=model,
+         tokenizer=tokenizer,
+         compute_metrics=compute_metrics_fn
+     )
+
+     # Get predictions
+     predictions_output = trainer.predict(test_dataset)
+     predictions = np.argmax(predictions_output.predictions, axis=1)
+     labels = predictions_output.label_ids
+
+     # Print metrics
+     print("\n[4/5] Computing detailed metrics...")
+     print("\n" + "=" * 60)
+     print("Test Set Results")
+     print("=" * 60)
+
+     metrics = predictions_output.metrics
+
+     # Overall metrics
+     print("\nOverall Metrics:")
+     overall_metrics = ['accuracy', 'f1_weighted', 'f1_macro', 'precision_weighted', 'recall_weighted']
+     for metric in overall_metrics:
+         key = f'test_{metric}'
+         if key in metrics:
+             print(f"  {metric.replace('_', ' ').title()}: {metrics[key]:.4f}")
+
+     # Per-class metrics
+     print("\nPer-Class Metrics:")
+     label_names = [id2label[i] for i in range(len(id2label))]
+     for label_name in label_names:
+         precision_key = f'test_precision_{label_name}'
+         recall_key = f'test_recall_{label_name}'
+         f1_key = f'test_f1_{label_name}'
+         if precision_key in metrics:
+             print(f"\n  {label_name.upper()}:")
+             print(f"    Precision: {metrics[precision_key]:.4f}")
+             print(f"    Recall:    {metrics[recall_key]:.4f}")
+             print(f"    F1-Score:  {metrics[f1_key]:.4f}")
+             print(f"    Support:   {metrics.get(f'test_support_{label_name}', 'N/A')}")
+
+     # Detailed classification report
+     print("\n" + "=" * 60)
+     print_classification_report(labels, predictions, label_names)
+
+     # Plot confusion matrix
+     print("\n[5/5] Generating visualizations...")
+     if save_plots:
+         plot_confusion_matrix(
+             labels,
+             predictions,
+             label_names,
+             save_path=os.path.join(output_dir, "confusion_matrix.png"),
+             normalize=False
+         )
+
+         # Also save normalized version
+         plot_confusion_matrix(
+             labels,
+             predictions,
+             label_names,
+             save_path=os.path.join(output_dir, "confusion_matrix_normalized.png"),
+             normalize=True
+         )
+
+     # Error analysis
+     error_df = analyze_errors(test_dataset, predictions, labels, id2label, tokenizer)
+     if len(error_df) > 0 and save_plots:
+         error_path = os.path.join(output_dir, "error_analysis.csv")
+         error_df.to_csv(error_path, index=False)
+         print(f"✓ Error analysis saved to {error_path}")
+
+     print("\n" + "=" * 60)
+     print("Evaluation Complete! 🎉")
+     print("=" * 60)
+     print(f"\nResults saved to: {output_dir}")
+
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser(description="Evaluate trained model")
+     parser.add_argument(
+         "--model-path",
+         type=str,
+         default="./results/final_model",
+         help="Path to the trained model"
+     )
+     parser.add_argument(
+         "--config",
+         type=str,
+         default="config.yaml",
+         help="Path to configuration file"
+     )
+     parser.add_argument(
+         "--no-plots",
+         action="store_true",
+         help="Skip generating visualization plots"
+     )
+     args = parser.parse_args()
+
+     evaluate_model(args.model_path, args.config, save_plots=not args.no_plots)
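
Example usage (a sketch based on the argparse flags defined above; the default --model-path assumes training saved its final model to ./results/final_model):

    python scripts/evaluate.py --model-path ./results/final_model --config config.yaml
    python scripts/evaluate.py --no-plots   # metrics and error analysis only, no figures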
scripts/generate_data.py ADDED
@@ -0,0 +1,190 @@
+ """
+ Generate synthetic training data for Code Comment Quality Classifier
+ """
+ import math
+ import os
+ import random
+
+ import pandas as pd
+
+
+ # Example comments for each category
+ EXCELLENT_COMMENTS = [
+     "This function calculates the Fibonacci sequence using dynamic programming to avoid redundant calculations. Time complexity: O(n), Space complexity: O(n)",
+     "Validates user input against SQL injection attacks using parameterized queries. Returns True if safe, False otherwise. Raises ValueError for invalid input types.",
+     "Binary search implementation for sorted arrays. Uses divide-and-conquer approach. Params: arr (sorted list), target (value). Returns: index or -1 if not found.",
+     "Implements the Singleton pattern to ensure only one instance of DatabaseConnection exists. Thread-safe using double-checked locking.",
+     "Parses JSON configuration file and validates against schema. Handles nested objects and arrays. Raises ConfigurationError if validation fails.",
+     "Asynchronous HTTP request handler with retry logic and exponential backoff. Max retries: 3. Timeout: 30s. Returns: Response object or None on failure.",
+     "Generates secure random tokens for authentication using CSPRNG. Length: 32 bytes. Returns: hex-encoded string. Used in password reset flows.",
+     "Custom hook that debounces state updates to prevent excessive re-renders. Delay: configurable ms. Returns: debounced value and setter function.",
+     "Optimized matrix multiplication using Strassen's algorithm. Suitable for large matrices (n > 64). Time complexity: O(n^2.807).",
+     "Decorator that caches function results with LRU eviction policy. Max size: 128 entries. Thread-safe. Improves performance for expensive computations.",
+ ]
+
+ HELPFUL_COMMENTS = [
+     "Calculates the sum of two numbers and returns the result",
+     "This function sorts the array in ascending order",
+     "Checks if the user is logged in before proceeding",
+     "Converts temperature from Celsius to Fahrenheit",
+     "Returns the current timestamp in UTC format",
+     "Validates email format using regex pattern",
+     "Fetches user data from the database by ID",
+     "Updates the UI when data changes",
+     "Handles file upload and saves to storage",
+     "Generates a random string of specified length",
+     "Removes duplicates from the list",
+     "Encrypts password before storing in database",
+     "Sends email notification to user",
+     "Formats date string for display",
+     "Calculates total price including tax",
+ ]
+
+ UNCLEAR_COMMENTS = [
+     "does stuff",
+     "magic happens here",
+     "don't touch this",
+     "idk why this works but it does",
+     "temporary solution",
+     "quick fix",
+     "handles things",
+     "processes data",
+     "important function",
+     "legacy code",
+     "weird edge case",
+     "not sure what this does",
+     "complicated logic",
+     "TODO",
+     "fix me",
+     "helper method",
+     "utility function",
+     "wrapper",
+     "handler",
+     "manager",
+ ]
+
+ OUTDATED_COMMENTS = [
+     "DEPRECATED: Use the new API endpoint instead",
+     "This will be removed in version 2.0",
+     "TODO: Refactor this to use async/await",
+     "Old implementation - kept for backwards compatibility",
+     "NOTE: This approach is no longer recommended",
+     "FIXME: Memory leak issue - needs update",
+     "Uses legacy authentication system",
+     "WARNING: This method is obsolete",
+     "Replaced by getUserInfo() in v1.5",
+     "Temporary workaround - pending proper fix",
+     "DEPRECATED: Direct database access - use ORM instead",
+     "Old validation logic - update to new schema",
+     "Uses outdated library - migrate to modern alternative",
+     "This was for Python 2 compatibility",
+     "FIXME: Security vulnerability - needs immediate update",
+ ]
+
+
+ def generate_variations(base_comments: list, num_variations: int = 5) -> list:
+     """Generate variations of base comments to increase dataset size."""
+     variations = []
+
+     prefixes = ["", "Note: ", "Important: ", "Info: ", ""]
+     suffixes = ["", ".", "...", " // end", ""]
+
+     for comment in base_comments:
+         variations.append(comment)
+         for _ in range(num_variations - 1):
+             prefix = random.choice(prefixes)
+             suffix = random.choice(suffixes)
+             varied = f"{prefix}{comment}{suffix}"
+             variations.append(varied)
+
+     return variations
+
+
+ def generate_dataset(output_path: str = "./data/comments.csv", samples_per_class: int = 250):
+     """
+     Generate synthetic training dataset.
+
+     Args:
+         output_path: Path to save the CSV file
+         samples_per_class: Number of samples to generate per class
+     """
+     print("=" * 60)
+     print("Generating Synthetic Training Data")
+     print("=" * 60)
+
+     # Seed the RNG so the generated variations are reproducible
+     # (matches the random_state=42 used for shuffling below)
+     random.seed(42)
+
+     # Create data directory if it doesn't exist (handle bare filenames too)
+     os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
+
+     # Generate variations. Ceiling division ensures every class reaches at
+     # least samples_per_class before truncation; floor division would leave
+     # classes whose base list size doesn't divide samples_per_class undersized.
+     print("\nGenerating comment variations...")
+     excellent_samples = generate_variations(EXCELLENT_COMMENTS, math.ceil(samples_per_class / len(EXCELLENT_COMMENTS)))
+     helpful_samples = generate_variations(HELPFUL_COMMENTS, math.ceil(samples_per_class / len(HELPFUL_COMMENTS)))
+     unclear_samples = generate_variations(UNCLEAR_COMMENTS, math.ceil(samples_per_class / len(UNCLEAR_COMMENTS)))
+     outdated_samples = generate_variations(OUTDATED_COMMENTS, math.ceil(samples_per_class / len(OUTDATED_COMMENTS)))
+
+     # Truncate so we have exactly samples_per_class for each class
+     excellent_samples = excellent_samples[:samples_per_class]
+     helpful_samples = helpful_samples[:samples_per_class]
+     unclear_samples = unclear_samples[:samples_per_class]
+     outdated_samples = outdated_samples[:samples_per_class]
+
+     # Create DataFrame
+     data = {
+         'comment': (
+             excellent_samples +
+             helpful_samples +
+             unclear_samples +
+             outdated_samples
+         ),
+         'label': (
+             ['excellent'] * len(excellent_samples) +
+             ['helpful'] * len(helpful_samples) +
+             ['unclear'] * len(unclear_samples) +
+             ['outdated'] * len(outdated_samples)
+         )
+     }
+
+     df = pd.DataFrame(data)
+
+     # Shuffle the dataset
+     df = df.sample(frac=1, random_state=42).reset_index(drop=True)
+
+     # Save to CSV
+     df.to_csv(output_path, index=False)
+
+     print("\n✓ Dataset generated successfully!")
+     print(f"✓ Total samples: {len(df)}")
+     print(f"✓ Saved to: {output_path}")
+
+     print("\nClass distribution:")
+     print(df['label'].value_counts().sort_index())
+
+     print("\nSample comments:")
+     print("-" * 60)
+     for label in ['excellent', 'helpful', 'unclear', 'outdated']:
+         sample = df[df['label'] == label].iloc[0]['comment']
+         print(f"\n[{label.upper()}]")
+         print(f"  {sample}")
+
+     print("\n" + "=" * 60)
+     print("Data generation complete! 🎉")
+     print("=" * 60)
+
+
+ if __name__ == "__main__":
+     import argparse
+
+     parser = argparse.ArgumentParser(description="Generate synthetic training data")
+     parser.add_argument(
+         "--output",
+         type=str,
+         default="./data/comments.csv",
+         help="Output path for the CSV file"
+     )
+     parser.add_argument(
+         "--samples-per-class",
+         type=int,
+         default=250,
+         help="Number of samples to generate per class"
+     )
+     args = parser.parse_args()
+
+     generate_dataset(args.output, args.samples_per_class)
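
Example usage (flags as defined above; the output directory is created automatically if it does not exist):

    python scripts/generate_data.py
    python scripts/generate_data.py --output ./data/comments.csv --samples-per-class 500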
scripts/publish_to_hub.py ADDED
@@ -0,0 +1,247 @@
+ """
+ Comprehensive script to publish model and codebase to Hugging Face Hub
+ """
+ import argparse
+ import os
+ import sys
+ from pathlib import Path
+ from huggingface_hub import create_repo, upload_folder, upload_file
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
+
+ # Add parent directory to path
+ sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+
+
+ def publish_to_hub(
+     model_path: str,
+     repo_id: str,
+     private: bool = False,
+     upload_code: bool = True,
+     upload_model: bool = True
+ ) -> bool:
+     """
+     Publish model and codebase to Hugging Face Hub.
+
+     Args:
+         model_path: Path to the trained model
+         repo_id: Full repository ID (e.g., "username/repo-name")
+         private: Whether to make the repository private
+         upload_code: Whether to upload code files
+         upload_model: Whether to upload the model
+     """
+     print("=" * 70)
+     print("Publishing to Hugging Face Hub")
+     print("=" * 70)
+     print(f"\nRepository: {repo_id}")
+     print(f"Private: {private}")
+     print(f"Upload Model: {upload_model}")
+     print(f"Upload Code: {upload_code}")
+
+     # Create repository
+     print("\n[1/4] Creating/verifying repository...")
+     try:
+         create_repo(
+             repo_id=repo_id,
+             repo_type="model",
+             exist_ok=True,
+             private=private
+         )
+         print(f"✓ Repository ready: {repo_id}")
+     except Exception as e:
+         print(f"✗ Error creating repository: {e}")
+         print("\nMake sure you're logged in:")
+         print("  huggingface-cli login")
+         return False
+
+     # Upload model and tokenizer
+     if upload_model:
+         print("\n[2/4] Uploading model and tokenizer...")
+         try:
+             if not os.path.exists(model_path):
+                 print(f"✗ Model path not found: {model_path}")
+                 print("  Skipping model upload. You can upload it later.")
+             else:
+                 tokenizer = AutoTokenizer.from_pretrained(model_path)
+                 model = AutoModelForSequenceClassification.from_pretrained(model_path)
+
+                 model.push_to_hub(repo_id)
+                 tokenizer.push_to_hub(repo_id)
+                 print("✓ Model and tokenizer uploaded")
+         except Exception as e:
+             print(f"✗ Error uploading model: {e}")
+             print("  You can upload the model separately later.")
+     else:
+         print("\n[2/4] Skipping model upload (--no-model flag)")
+
+     # Upload code files
+     if upload_code:
+         print("\n[3/4] Uploading code files...")
+         try:
+             repo_root = Path(__file__).parent.parent
+
+             # Files to upload
+             code_files = [
+                 "train.py",
+                 "inference.py",
+                 "config.yaml",
+                 "requirements.txt",
+                 "setup.py",
+                 "README.md",
+                 "MODEL_CARD.md",
+                 "LICENSE",
+                 ".gitignore"
+             ]
+
+             # Directories to upload
+             code_dirs = [
+                 "src",
+                 "scripts"
+             ]
+
+             uploaded_count = 0
+
+             # Upload individual files
+             for file_name in code_files:
+                 file_path = repo_root / file_name
+                 if file_path.exists():
+                     try:
+                         upload_file(
+                             path_or_fileobj=str(file_path),
+                             path_in_repo=file_name,
+                             repo_id=repo_id,
+                             repo_type="model"
+                         )
+                         print(f"  ✓ Uploaded {file_name}")
+                         uploaded_count += 1
+                     except Exception as e:
+                         print(f"  ⚠ Could not upload {file_name}: {e}")
+
+             # Upload directories
+             for dir_name in code_dirs:
+                 dir_path = repo_root / dir_name
+                 if dir_path.exists() and dir_path.is_dir():
+                     try:
+                         upload_folder(
+                             folder_path=str(dir_path),
+                             path_in_repo=dir_name,
+                             repo_id=repo_id,
+                             repo_type="model",
+                             ignore_patterns=["__pycache__", "*.pyc", ".DS_Store"]
+                         )
+                         print(f"  ✓ Uploaded {dir_name}/")
+                         uploaded_count += 1
+                     except Exception as e:
+                         print(f"  ⚠ Could not upload {dir_name}/: {e}")
+
+             print(f"\n✓ Uploaded {uploaded_count} code files/directories")
+
+         except Exception as e:
+             print(f"✗ Error uploading code: {e}")
+     else:
+         print("\n[3/4] Skipping code upload (--no-code flag)")
+
+     # Final summary
+     print("\n[4/4] Publishing complete!")
+     print("\n" + "=" * 70)
+     print("Success! 🎉")
+     print("=" * 70)
+     print("\nYour model is now available at:")
+     print(f"https://huggingface.co/{repo_id}")
+
+     if upload_model:
+         print("\nTo use your model:")
+         print(f"""
+ from transformers import pipeline
+
+ classifier = pipeline("text-classification", model="{repo_id}")
+
+ # Classify a comment
+ result = classifier("This function uses dynamic programming for O(n) time complexity")
+ print(result)
+ """)
+
+     return True
+
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser(
+         description="Publish model and codebase to Hugging Face Hub",
+         formatter_class=argparse.RawDescriptionHelpFormatter,
+         epilog="""
+ Examples:
+   # Publish everything (model + code)
+   python scripts/publish_to_hub.py --repo-id Snaseem2026/code-comment-classifier
+
+   # Publish only code (no model)
+   python scripts/publish_to_hub.py --repo-id Snaseem2026/code-comment-classifier --no-model
+
+   # Publish only model (no code)
+   python scripts/publish_to_hub.py --repo-id Snaseem2026/code-comment-classifier --no-code
+
+   # Private repository
+   python scripts/publish_to_hub.py --repo-id Snaseem2026/code-comment-classifier --private
+ """
+     )
+     parser.add_argument(
+         "--model-path",
+         type=str,
+         default="./results/final_model",
+         help="Path to the trained model"
+     )
+     parser.add_argument(
+         "--repo-id",
+         type=str,
+         default="Snaseem2026/code-comment-classifier",
+         help="Full repository ID (e.g., 'username/repo-name')"
+     )
+     parser.add_argument(
+         "--private",
+         action="store_true",
+         help="Make the repository private"
+     )
+     parser.add_argument(
+         "--no-code",
+         action="store_true",
+         help="Skip uploading code files"
+     )
+     parser.add_argument(
+         "--no-model",
+         action="store_true",
+         help="Skip uploading model files"
+     )
+     parser.add_argument(
+         "--yes",
+         action="store_true",
+         help="Skip confirmation prompt"
+     )
+
+     args = parser.parse_args()
+
+     print("\n" + "=" * 70)
+     print("Hugging Face Hub Publishing")
+     print("=" * 70)
+     print("\nBefore publishing, make sure you:")
+     print("1. Have a Hugging Face account")
+     print("2. Are logged in: huggingface-cli login")
+     print("3. Have reviewed MODEL_CARD.md and README.md")
+     print(f"4. Model path exists: {args.model_path} ({'✓' if os.path.exists(args.model_path) else '✗'})")
+
+     if not args.yes:
+         print("\n" + "=" * 70)
+         response = input(f"\nProceed with publishing to {args.repo_id}? (yes/no): ")
+         if response.lower() not in ['yes', 'y']:
+             print("Publishing cancelled.")
+             sys.exit(0)
+
+     success = publish_to_hub(
+         model_path=args.model_path,
+         repo_id=args.repo_id,
+         private=args.private,
+         upload_code=not args.no_code,
+         upload_model=not args.no_model
+     )
+
+     if not success:
+         sys.exit(1)
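
Beyond the epilog examples above, the --yes flag defined in the parser skips the confirmation prompt, which is useful for non-interactive runs (e.g., in CI):

    python scripts/publish_to_hub.py --repo-id Snaseem2026/code-comment-classifier --yes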
scripts/upload_to_hub.py ADDED
@@ -0,0 +1,134 @@
+ """
+ Upload trained model to Hugging Face Hub
+ """
+ import argparse
+ import sys
+ import os
+ from typing import Optional
+ from huggingface_hub import create_repo
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
+
+ # Add parent directory to path (if needed)
+ sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+
+
+ def upload_to_hub(
+     model_path: str,
+     repo_name: str,
+     organization: Optional[str] = None,
+     private: bool = False
+ ):
+     """
+     Upload model to Hugging Face Hub.
+
+     Args:
+         model_path: Path to the trained model
+         repo_name: Name for the repository on Hugging Face Hub
+         organization: Organization name (optional)
+         private: Whether to make the repository private
+     """
+     print("=" * 60)
+     print("Uploading Model to Hugging Face Hub")
+     print("=" * 60)
+
+     # Create full repo ID
+     if organization:
+         repo_id = f"{organization}/{repo_name}"
+     else:
+         repo_id = repo_name
+
+     print(f"\nRepository: {repo_id}")
+     print(f"Private: {private}")
+
+     # Load model and tokenizer
+     print("\n[1/3] Loading model...")
+     try:
+         tokenizer = AutoTokenizer.from_pretrained(model_path)
+         model = AutoModelForSequenceClassification.from_pretrained(model_path)
+         print("✓ Model loaded successfully")
+     except Exception as e:
+         print(f"✗ Error loading model: {e}")
+         return
+
+     # Create repository
+     print("\n[2/3] Creating repository...")
+     try:
+         create_repo(
+             repo_id=repo_id,
+             repo_type="model",
+             exist_ok=True,
+             private=private
+         )
+         print(f"✓ Repository created/verified: {repo_id}")
+     except Exception as e:
+         print(f"✗ Error creating repository: {e}")
+         print("\nMake sure you're logged in:")
+         print("  huggingface-cli login")
+         return
+
+     # Push to hub
+     print("\n[3/3] Uploading model and tokenizer...")
+     try:
+         model.push_to_hub(repo_id)
+         tokenizer.push_to_hub(repo_id)
+         print("✓ Upload complete!")
+     except Exception as e:
+         print(f"✗ Error uploading: {e}")
+         return
+
+     print("\n" + "=" * 60)
+     print("Success! 🎉")
+     print("=" * 60)
+     print("\nYour model is now available at:")
+     print(f"https://huggingface.co/{repo_id}")
+
+     print("\nTo use your model:")
+     print(f"""
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
+
+ tokenizer = AutoTokenizer.from_pretrained("{repo_id}")
+ model = AutoModelForSequenceClassification.from_pretrained("{repo_id}")
+ """)
+
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser(description="Upload model to Hugging Face Hub")
+     parser.add_argument(
+         "--model-path",
+         type=str,
+         default="./results/final_model",
+         help="Path to the trained model"
+     )
+     parser.add_argument(
+         "--repo-name",
+         type=str,
+         required=True,
+         help="Name for the repository on Hugging Face Hub"
+     )
+     parser.add_argument(
+         "--organization",
+         type=str,
+         default=None,
+         help="Organization name (optional)"
+     )
+     parser.add_argument(
+         "--private",
+         action="store_true",
+         help="Make the repository private"
+     )
+     args = parser.parse_args()
+
+     print("\nBefore uploading, make sure you:")
+     print("1. Have a Hugging Face account")
+     print("2. Are logged in: huggingface-cli login")
+     print("3. Have reviewed the model card (MODEL_CARD.md)")
+
+     response = input("\nProceed with upload? (yes/no): ")
+     if response.lower() in ['yes', 'y']:
+         upload_to_hub(
+             args.model_path,
+             args.repo_name,
+             args.organization,
+             args.private
+         )
+     else:
+         print("Upload cancelled.")
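
Example usage (per the parser above: --repo-name is required, --organization and --private are optional; my-org is a placeholder organization name):

    python scripts/upload_to_hub.py --repo-name code-comment-classifier
    python scripts/upload_to_hub.py --repo-name code-comment-classifier --organization my-org --private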