yazoniak committed on
Commit
fac038c
·
verified ·
1 Parent(s): 41c915d

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +59 -3
app.py CHANGED
@@ -4,11 +4,23 @@ Gradio app for Polish Twitter Emotion Classifier.
4
  This application provides an interactive interface for predicting emotions
5
  and sentiment in Polish text using a fine-tuned RoBERTa model.
6
 
7
- For private models, set the HF_TOKEN environment variable:
8
- export HF_TOKEN=your_huggingface_token
 
 
 
 
 
 
 
 
 
 
 
9
  """
10
 
11
  import gradio as gr
 
12
  from transformers import AutoModelForSequenceClassification, AutoTokenizer
13
  import torch
14
  import numpy as np
@@ -25,6 +37,10 @@ DEFAULT_THRESHOLD = 0.5
25
  # Authentication token for private models
26
  HF_TOKEN = os.environ.get("HF_TOKEN", None)
27
 
 
 
 
 
28
  # Emotion emojis for visual display
29
  LABEL_EMOJIS = {
30
  "radość": "😊",
@@ -106,6 +122,22 @@ model, tokenizer, labels, calibration_artifacts = load_model()
106
  print(f"✓ Model loaded successfully with {len(labels)} labels")
107
  print(f" Labels: {', '.join(labels)}")
108
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
109
 
110
  def predict_emotions(
111
  text: str,
@@ -115,6 +147,8 @@ def predict_emotions(
115
  ) -> tuple[str, str]:
116
  """
117
  Predict emotions and sentiment for Polish text.
 
 
118
 
119
  Args:
120
  text: Input Polish text
@@ -278,6 +312,17 @@ def predict_emotions(
278
 
279
  all_scores_json = json.dumps(json_output, indent=2, ensure_ascii=False)
280
 
 
 
 
 
 
 
 
 
 
 
 
281
  return result_text, all_scores_json
282
 
283
 
@@ -343,7 +388,7 @@ with gr.Blocks(
343
  with gr.Accordion("Detailed JSON Output", open=False):
344
  json_output = gr.Code(label="Full Prediction Details", language="json")
345
 
346
- # Connect the button
347
  predict_btn.click(
348
  fn=predict_emotions,
349
  inputs=[text_input, mode_input, threshold_input, anonymize_input],
@@ -385,6 +430,7 @@ with gr.Blocks(
385
  - **Calibrated Mode** (Recommended): Uses temperature scaling and label-specific optimal thresholds for better accuracy and calibration. This mode is recommended for most use cases.
386
  - **Default Mode**: Uses sigmoid activation with a single threshold across all labels. Useful for quick predictions or when you want uniform threshold control.
387
 
 
388
  ### Limitations
389
 
390
  - Model is trained on Polish Twitter data and works best with informal social media text
@@ -408,6 +454,16 @@ with gr.Blocks(
408
  ### 📄 License
409
 
410
  GPL-3.0 License
 
 
 
 
 
 
 
 
 
 
411
  """)
412
 
413
 
 
4
  This application provides an interactive interface for predicting emotions
5
  and sentiment in Polish text using a fine-tuned RoBERTa model.
6
 
7
+ Environment Variables:
8
+ HF_TOKEN: HuggingFace authentication token (required for private models and auto-logging)
9
+ export HF_TOKEN=your_huggingface_token
10
+
11
+ HF_DATASET_REPO: HuggingFace dataset name for storing predictions (optional)
12
+ export HF_DATASET_REPO=your-username/predictions-dataset
13
+ Default: "twitter-emotion-pl-feedback"
14
+
15
+ Features:
16
+ - Multi-label emotion and sentiment classification
17
+ - Calibrated predictions with temperature scaling
18
+ - Automatic prediction logging via HuggingFaceDatasetSaver
19
+ - Persistent data storage across space restarts
20
  """
21
 
22
  import gradio as gr
23
+ from gradio.flagging import HuggingFaceDatasetSaver
24
  from transformers import AutoModelForSequenceClassification, AutoTokenizer
25
  import torch
26
  import numpy as np
 
37
  # Authentication token for private models
38
  HF_TOKEN = os.environ.get("HF_TOKEN", None)
39
 
40
+ # Flagging configuration - dataset for storing automatically logged predictions
41
+ # Set this to your desired dataset name, e.g. "your-username/model-feedback"
42
+ HF_DATASET_REPO = os.environ.get("HF_DATASET_REPO", "twitter-emotion-pl-feedback")
43
+
44
  # Emotion emojis for visual display
45
  LABEL_EMOJIS = {
46
  "radość": "😊",
 
122
  print(f"✓ Model loaded successfully with {len(labels)} labels")
123
  print(f" Labels: {', '.join(labels)}")
124
 
125
+ # Initialize flagging callback for automatic prediction logging
126
+ flagging_callback = None
127
+ if HF_TOKEN:
128
+ try:
129
+ flagging_callback = HuggingFaceDatasetSaver(
130
+ hf_token=HF_TOKEN,
131
+ dataset_name=HF_DATASET_REPO,
132
+ private=True,
133
+ )
134
+ print(f"✓ Auto-logging enabled - all predictions will be saved to: {HF_DATASET_REPO}")
135
+ except Exception as e:
136
+ print(f"⚠ Could not initialize auto-logging: {e}")
137
+ print(" Predictions will not be logged")
138
+ else:
139
+ print("⚠ HF_TOKEN not set - auto-logging disabled")
140
+
141
 
142
  def predict_emotions(
143
  text: str,
 
147
  ) -> tuple[str, str]:
148
  """
149
  Predict emotions and sentiment for Polish text.
150
+
151
+ Automatically logs all predictions to a HuggingFace dataset if flagging is enabled.
152
 
153
  Args:
154
  text: Input Polish text
 
312
 
313
  all_scores_json = json.dumps(json_output, indent=2, ensure_ascii=False)
314
 
315
+ # Automatically log all predictions if flagging is enabled
316
+ if flagging_callback:
317
+ try:
318
+ flagging_callback.flag(
319
+ flag_data=[text, mode, threshold, anonymize, result_text, all_scores_json],
320
+ flag_option="auto_logged",
321
+ username=None,
322
+ )
323
+ except Exception as e:
324
+ print(f"⚠ Error logging prediction: {e}")
325
+
326
  return result_text, all_scores_json
327
 
328
 
 
388
  with gr.Accordion("Detailed JSON Output", open=False):
389
  json_output = gr.Code(label="Full Prediction Details", language="json")
390
 
391
+ # Connect the predict button
392
  predict_btn.click(
393
  fn=predict_emotions,
394
  inputs=[text_input, mode_input, threshold_input, anonymize_input],
 
430
  - **Calibrated Mode** (Recommended): Uses temperature scaling and label-specific optimal thresholds for better accuracy and calibration. This mode is recommended for most use cases.
431
  - **Default Mode**: Uses sigmoid activation with a single threshold across all labels. Useful for quick predictions or when you want uniform threshold control.
432
 
433
+
434
  ### Limitations
435
 
436
  - Model is trained on Polish Twitter data and works best with informal social media text
 
454
  ### 📄 License
455
 
456
  GPL-3.0 License
457
+
458
+ ---
459
+
460
+ ### 📊 Data Collection Notice
461
+
462
+ This space automatically logs all predictions for model improvement and research purposes. The collected data includes:
463
+ - Input text and analysis settings
464
+ - Model predictions and confidence scores
465
+
466
+ All data is stored securely in a private HuggingFace dataset and used solely for improving the model's performance.
467
  """)
468
 
469