Fanny1366 committed
Commit 2201d3b · verified · 1 Parent(s): 2e70dd6

Update app.py

Files changed (1)
  1. app.py +51 -208
app.py CHANGED
@@ -1,108 +1,34 @@
- from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer, pipeline
- from datasets import Dataset
- from evaluate import load
  import numpy as np
  import torch

- # Step 1: Define your dataset
- # Replace with your actual dataset
- train_data = {
-     "Review": [
-         "This product is excellent, I love it!",
-         "Terrible experience, would not recommend.",
-         "It's okay, not great, but not bad either."
-     ],
-     "labels": [2, 0, 1]  # Assuming 0=Very Negative, 1=Negative, 2=Neutral, 3=Positive, 4=Very Positive
- }
- eval_data = {
-     "Review": [
-         "Amazing quality, worth the price!",
-         "Awful, completely disappointed."
-     ],
-     "labels": [4, 0]
- }
-
- # Convert the dicts to Hugging Face Dataset objects
- small_train_dataset = Dataset.from_dict(train_data)
- small_eval_dataset = Dataset.from_dict(eval_data)
-
- # Step 2: Load the model and tokenizer
- model = AutoModelForSequenceClassification.from_pretrained(
-     "tabularisai/multilingual-sentiment-analysis",
-     num_labels=5  # Must match the number of labels in the dataset
- )
- tokenizer = AutoTokenizer.from_pretrained(
-     "tabularisai/multilingual-sentiment-analysis"
- )
-
- # Step 3: Tokenize the datasets
- def tokenize_function(examples):
-     return tokenizer(examples["Review"], padding="max_length", truncation=True, max_length=128)
-
- tokenized_train = small_train_dataset.map(tokenize_function, batched=True)
- tokenized_eval = small_eval_dataset.map(tokenize_function, batched=True)
-
- # Rename the "labels" column to "label"
- tokenized_train = tokenized_train.rename_column("labels", "label")
- tokenized_eval = tokenized_eval.rename_column("labels", "label")
-
- # Step 4: Define the compute_metrics function
- accuracy_metric = load("accuracy")
- def compute_metrics(eval_pred):
-     logits, labels = eval_pred
-     predictions = np.argmax(logits, axis=-1)
-     return accuracy_metric.compute(predictions=predictions, references=labels)
-
- # Step 5: Configure training arguments
- training_args = TrainingArguments(
-     output_dir="test_trainer",
-     num_train_epochs=1,             # Number of training epochs
-     per_device_train_batch_size=4,  # Adjust batch size to available GPU memory
-     evaluation_strategy="epoch",    # Evaluate after each epoch
-     save_strategy="no",             # Skip checkpoint saving for simplicity
-     learning_rate=5e-5,             # Fine-tuning learning rate
-     logging_dir="logs",             # Log directory
-     seed=42                         # Ensure reproducibility
- )
-
- # Step 6: Set up the Trainer
- trainer = Trainer(
-     model=model,
-     args=training_args,
-     train_dataset=tokenized_train,
-     eval_dataset=tokenized_eval,
-     compute_metrics=compute_metrics
- )
-
- # Seed NumPy and PyTorch for reproducibility
- np.random.seed(42)
- torch.manual_seed(42)
- if torch.cuda.is_available():
-     torch.cuda.manual_seed(42)
-
- # Step 7: Train and evaluate
  try:
-     print("Training the model...")
-     trainer.train()  # Train the model
-     print("Evaluating the model...")
-     eval_results = trainer.evaluate()  # Evaluate the model
-     print("Evaluation Results:", eval_results)
- except RuntimeError as e:
-     print("RuntimeError occurred:", str(e))
-
- # Step 8: Use a pipeline for quick testing
- print("\nPipeline Testing:")
- sentiment_pipeline = pipeline(
-     "text-classification",
-     model=model,
-     tokenizer=tokenizer
- )
-
- # Example test case
- text = "No commercials, and no adds no need for wifi it can use the satellite radio station to pick up or at least that's how it looks"
- result = sentiment_pipeline(text)

- # Correct label map based on the model's outputs
  label_map = {
      "Very Negative": 0,
      "Negative": 1,
@@ -111,115 +37,32 @@ label_map = {
      "Very Positive": 4
  }

- # Map the predicted label to its numeric equivalent
- predicted_label = label_map[result[0]['label']]
- confidence = result[0]['score']
-
- print(f"Text: {text}")
- print(f"Predicted label: {predicted_label} ({result[0]['label']})")
- print(f"Confidence: {confidence:.4f}")
-
- # Batch testing
- examples = [
-     {"text": "The stock market showed a strong recovery today.", "label": 4},
-     {"text": "The company's performance is a disaster!", "label": 0},
-     {"text": "It's a stable investment with consistent returns.", "label": 2}
- ]
-
- print("\nBatch Testing:")
- for example in examples:
-     result = sentiment_pipeline(example["text"])
-     predicted_label = label_map[result[0]['label']]  # Map the model's output
-     print(f"Text: {example['text'][:50]}...")
-     print(f"True: {example['label']} | Predicted: {predicted_label} ({result[0]['label']}) | Confidence: {result[0]['score']:.2f}")
-     print("-" * 60)

- # The second pipeline: text extraction
- # Installation
-
- from transformers import pipeline
- from keybert import KeyBERT
- from collections import defaultdict
- import re
-
- # Initialize the models (the first run downloads them automatically)
- kw_model = KeyBERT()  # Keyword extraction
- classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")  # Demand classification
-
- # Sample Spotify review data
- reviews = [
-     {"text": "Love the Discover Weekly feature but ads are too frequent.", "rating": 4},
-     {"text": "App crashes every time I save a playlist.", "rating": 1},
-     {"text": "Please add a sleep timer option!", "rating": 5},
-     {"text": "Lyrics are out of sync with the music.", "rating": 2},
- ]
-
- # Predefined demand category labels
- demand_labels = [
-     "feature request",   # Feature requests
-     "bug report",        # Problem reports
-     "content issue",     # Content issues (e.g., lyrics)
-     "subscription",      # Subscription related
-     "general feedback"   # General feedback
- ]
-
- def analyze_reviews(reviews):
-     results = []
-     for review in reviews:
-         text = review["text"]
-         rating = review["rating"]
-
-         # 1. Keyword extraction (KeyBERT)
          keywords = kw_model.extract_keywords(
-             text,
-             keyphrase_ngram_range=(1, 2),  # Extract 1-2 word phrases
-             stop_words="english",          # Filter English stop words
-             top_n=3                        # Return the top 3 keywords
          )
-         keywords = [kw[0] for kw in keywords]  # Keep only the keyword strings
-
-         # 2. Demand classification (zero-shot)
-         demand_result = classifier(text, demand_labels)
-         primary_demand = demand_result["labels"][0]  # Most likely demand type
-
-         # 3. Derive urgency from the rating
-         urgency = "low"
-         if rating <= 2:
-             urgency = "high"
-         elif rating <= 4:
-             urgency = "medium"
-
-         # Structured result
-         results.append({
-             "text": text,
-             "rating": rating,
-             "keywords": keywords,
-             "demand_type": primary_demand,
-             "urgency": urgency
-         })
-     return results
-
- # Run the analysis
- analysis_results = analyze_reviews(reviews)
-
- # Print the results
- for i, result in enumerate(analysis_results, 1):
-     print(f"\nReview {i}:")
-     print(f"Text: {result['text']}")
-     print(f"Rating: {result['rating']}/5")
-     print(f"Keywords: {', '.join(result['keywords'])}")
-     print(f"Demand Type: {result['demand_type']}")
-     print(f"Urgency: {result['urgency']}")
-
- import streamlit as st
-
- # Create a text input box
- input_text = st.text_input("请输入文本")
-
- # Display the entered text
- if input_text:
-     st.write(f"你输入的文本是: {input_text}")
 
+ import streamlit as st
+ from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
+ from keybert import KeyBERT
  import numpy as np
  import torch
+ from evaluate import load

+ # Load the sentiment analysis model and tokenizer
  try:
+     model = AutoModelForSequenceClassification.from_pretrained(
+         "tabularisai/multilingual-sentiment-analysis",
+         num_labels=5
+     )
+     tokenizer = AutoTokenizer.from_pretrained(
+         "tabularisai/multilingual-sentiment-analysis"
+     )
+     sentiment_pipeline = pipeline(
+         "text-classification",
+         model=model,
+         tokenizer=tokenizer
+     )
+ except Exception as e:
+     st.error(f"加载情感分析模型时出错: {e}")  # "Error loading the sentiment analysis model"
+
+ # Load the keyword extraction model
+ try:
+     kw_model = KeyBERT()
+ except Exception as e:
+     st.error(f"加载关键词提取模型时出错: {e}")  # "Error loading the keyword extraction model"

+ # Define the label map
  label_map = {
      "Very Negative": 0,
      "Negative": 1,

      "Very Positive": 4
  }

+ # Streamlit app title
+ st.title("文本情感与关键词分析")  # "Text Sentiment and Keyword Analysis"

+ # Create the text input box
+ input_text = st.text_area("请输入评价性文本", "")  # "Please enter a review text"

+ if input_text:
+     try:
+         # Sentiment analysis
+         result = sentiment_pipeline(input_text)
+         predicted_label = label_map[result[0]['label']]
+         rating = predicted_label + 1  # Convert the 0-4 label to a 1-5 rating
+         confidence = result[0]['score']
+
+         # Keyword extraction
          keywords = kw_model.extract_keywords(
+             input_text,
+             keyphrase_ngram_range=(1, 2),
+             stop_words="english",
+             top_n=3
          )
+         keyword_text = [kw[0] for kw in keywords]
+
+         # Display the results
+         st.write(f"情感评分: {rating}/5")  # "Sentiment rating"
+         st.write(f"置信度: {confidence:.2f}")  # "Confidence"
+         st.write(f"关键词: {', '.join(keyword_text)}")  # "Keywords"
+     except Exception as e:
+         st.error(f"分析文本时出错: {e}")  # "Error while analyzing the text"
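
For a quick sanity check of the sentiment step outside the Streamlit UI, a minimal sketch is below. It reuses the checkpoint name and label map from this commit; the "Neutral" and "Positive" entries are filled in from the 0-4 scheme documented in the removed code, and the sample text is taken from the old eval_data. The full app itself is started with: streamlit run app.py

# Minimal sketch, assuming transformers is installed and the
# tabularisai/multilingual-sentiment-analysis checkpoint is reachable.
from transformers import pipeline

# Passing the checkpoint name lets pipeline() fetch model and tokenizer itself.
sentiment_pipeline = pipeline(
    "text-classification",
    model="tabularisai/multilingual-sentiment-analysis"
)

# Same 0-4 mapping as app.py.
label_map = {
    "Very Negative": 0,
    "Negative": 1,
    "Neutral": 2,
    "Positive": 3,
    "Very Positive": 4
}

result = sentiment_pipeline("Amazing quality, worth the price!")  # sample text
predicted_label = label_map[result[0]["label"]]
print(f"Rating: {predicted_label + 1}/5, confidence: {result[0]['score']:.2f}")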