Update app.py
app.py CHANGED
@@ -1,108 +1,34 @@
-
-from …
-from …
+import streamlit as st
+from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
+from keybert import KeyBERT
 import numpy as np
 import torch
+from evaluate import load
 
-#
-# Replace with your actual dataset
-train_data = {
-    "Review": [
-        "This product is excellent, I love it!",
-        "Terrible experience, would not recommend.",
-        "It's okay, not great, but not bad either."
-    ],
-    "labels": [2, 0, 1]  # Assuming 0=Very Negative, 1=Negative, 2=Neutral, 3=Positive, 4=Very Positive
-}
-eval_data = {
-    "Review": [
-        "Amazing quality, worth the price!",
-        "Awful, completely disappointed."
-    ],
-    "labels": [4, 0]
-}
-
-# Convert datasets to Hugging Face Dataset format
-small_train_dataset = Dataset.from_dict(train_data)
-small_eval_dataset = Dataset.from_dict(eval_data)
-
-# Step 2: Load the model and tokenizer
-model = AutoModelForSequenceClassification.from_pretrained(
-    "tabularisai/multilingual-sentiment-analysis",
-    num_labels=5  # Ensure this matches the number of labels in your dataset
-)
-tokenizer = AutoTokenizer.from_pretrained(
-    "tabularisai/multilingual-sentiment-analysis"
-)
-
-# Step 3: Tokenize the datasets
-def tokenize_function(examples):
-    return tokenizer(examples["Review"], padding="max_length", truncation=True, max_length=128)
-
-tokenized_train = small_train_dataset.map(tokenize_function, batched=True)
-tokenized_eval = small_eval_dataset.map(tokenize_function, batched=True)
-
-# Ensure the datasets have the "labels" column renamed correctly
-tokenized_train = tokenized_train.rename_column("labels", "label")
-tokenized_eval = tokenized_eval.rename_column("labels", "label")
-
-# Step 4: Define the compute_metrics function
-accuracy_metric = load("accuracy")
-def compute_metrics(eval_pred):
-    logits, labels = eval_pred
-    predictions = np.argmax(logits, axis=-1)
-    return accuracy_metric.compute(predictions=predictions, references=labels)
-
-# Step 5: Configure training arguments
-training_args = TrainingArguments(
-    output_dir="test_trainer",
-    num_train_epochs=1,  # Increased epochs for better learning
-    per_device_train_batch_size=4,  # Adjust batch size based on available GPU memory
-    evaluation_strategy="epoch",  # Evaluate after each epoch
-    save_strategy="no",  # Avoid saving checkpoints for simplicity
-    learning_rate=5e-5,  # Fine-tuned learning rate
-    logging_dir="logs",  # Log directory
-    seed=42  # Ensure reproducibility
-)
-
-# Step 6: Set up the Trainer
-trainer = Trainer(
-    model=model,
-    args=training_args,
-    train_dataset=tokenized_train,
-    eval_dataset=tokenized_eval,
-    compute_metrics=compute_metrics
-)
-
-# Debug: Ensure reproducibility
-np.random.seed(42)
-torch.manual_seed(42)
-if torch.cuda.is_available():
-    torch.cuda.manual_seed(42)
-
-# Step 7: Train and Evaluate
+# Load the sentiment analysis model and tokenizer
 try:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    model = AutoModelForSequenceClassification.from_pretrained(
+        "tabularisai/multilingual-sentiment-analysis",
+        num_labels=5
+    )
+    tokenizer = AutoTokenizer.from_pretrained(
+        "tabularisai/multilingual-sentiment-analysis"
+    )
+    sentiment_pipeline = pipeline(
+        "text-classification",
+        model=model,
+        tokenizer=tokenizer
+    )
+except Exception as e:
+    st.error(f"Error loading the sentiment analysis model: {e}")
+
+# Load the keyword extraction model
+try:
+    kw_model = KeyBERT()
+except Exception as e:
+    st.error(f"Error loading the keyword extraction model: {e}")
 
-#
+# Define the label mapping
 label_map = {
     "Very Negative": 0,
     "Negative": 1,
@@ -111,115 +37,32 @@ label_map = {
     "Very Positive": 4
 }
 
-#
-
-confidence = result[0]['score']
-
-print(f"Text: {text}")
-print(f"Predicted label: {predicted_label} ({result[0]['label']})")
-print(f"Confidence: {confidence:.4f}")
-
-# Batch Testing
-examples = [
-    {"text": "The stock market showed a strong recovery today.", "label": 4},
-    {"text": "The company's performance is a disaster!", "label": 0},
-    {"text": "It's a stable investment with consistent returns.", "label": 2}
-]
-
-print("\nBatch Testing:")
-for example in examples:
-    result = sentiment_pipeline(example["text"])
-    predicted_label = label_map[result[0]['label']]  # Map the model's output
-    print(f"Text: {example['text'][:50]}...")
-    print(f"True: {example['label']} | Predicted: {predicted_label} ({result[0]['label']}) | Confidence: {result[0]['score']:.2f}")
-    print("-" * 60)
+# Streamlit app title
+st.title("Text Sentiment and Keyword Analysis")
 
+# Create the text input box
+input_text = st.text_area("Please enter review text", "")
 
-
-
-
-
-
-
-
-
-# Initialize the models (the first run will download them automatically)
-kw_model = KeyBERT()  # keyword extraction
-classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")  # requirement classification
-
-# Sample Spotify review data
-reviews = [
-    {"text": "Love the Discover Weekly feature but ads are too frequent.", "rating": 4},
-    {"text": "App crashes every time I save a playlist.", "rating": 1},
-    {"text": "Please add a sleep timer option!", "rating": 5},
-    {"text": "Lyrics are out of sync with the music.", "rating": 2},
-]
-
-# Predefined requirement category tags
-demand_labels = [
-    "feature request",   # feature requests
-    "bug report",        # problem feedback
-    "content issue",     # content issues (e.g., lyrics)
-    "subscription",      # subscription related
-    "general feedback"   # general feedback
-]
-
-def analyze_reviews(reviews):
-    results = []
-    for review in reviews:
-        text = review["text"]
-        rating = review["rating"]
-
-        # 1. Keyword extraction (KeyBERT)
+if input_text:
+    try:
+        # Sentiment analysis
+        result = sentiment_pipeline(input_text)
+        predicted_label = label_map[result[0]['label']]
+        rating = predicted_label + 1  # convert the 0-4 label to a 1-5 rating
+        confidence = result[0]['score']
+
+        # Keyword extraction
         keywords = kw_model.extract_keywords(
-            text,
-            keyphrase_ngram_range=(1, 2),
-            stop_words="english",
-            top_n=3
+            input_text,
+            keyphrase_ngram_range=(1, 2),
+            stop_words="english",
+            top_n=3
         )
-
-
-        #
-
-
-
-
-
-        if rating <= 2:
-            urgency = "high"
-        elif rating <= 4:
-            urgency = "medium"
-
-        # Structured results
-        results.append({
-            "text": text,
-            "rating": rating,
-            "keywords": keywords,
-            "demand_type": primary_demand,
-            "urgency": urgency
-        })
-    return results
-
-# Run the analysis
-analysis_results = analyze_reviews(reviews)
-
-# Print the results
-for i, result in enumerate(analysis_results, 1):
-    print(f"\nReview {i}:")
-    print(f"Text: {result['text']}")
-    print(f"Rating: {result['rating']}/5")
-    print(f"Keywords: {', '.join(result['keywords'])}")
-    print(f"Demand Type: {result['demand_type']}")
-    print(f"Urgency: {result['urgency']}")
-
-
-
-import streamlit as st
-
-# Create a text input box
-input_text = st.text_input("Please enter text")
-
-# Display the entered text
-if input_text:
-    st.write(f"The text you entered is: {input_text}")
+        keyword_text = [kw[0] for kw in keywords]
+
+        # Display the results
+        st.write(f"Sentiment rating: {rating}/5")
+        st.write(f"Confidence: {confidence:.2f}")
+        st.write(f"Keywords: {', '.join(keyword_text)}")
+    except Exception as e:
+        st.error(f"Error analyzing text: {e}")
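Note on the removed side: the run of deleted lines inside the try: under "# Step 7: Train and Evaluate" comes through empty in this view, so that part of the old script is not recoverable (the deleted test code below it already uses sentiment_pipeline, result, and text). A hypothetical sketch of what the lost block plausibly did, assuming the standard transformers Trainer API and the objects defined in Steps 2-6; every line is an assumption, not recovered source:

# Hypothetical reconstruction of the lost block inside the old try: (illustrative only)
try:
    trainer.train()               # fine-tune on the toy training set
    metrics = trainer.evaluate()  # runs compute_metrics on the eval set
    print(metrics)
    # The deleted tests below the gap expect a text-classification pipeline:
    sentiment_pipeline = pipeline("text-classification", model=model, tokenizer=tokenizer)
except Exception as e:
    print(f"Training failed: {e}")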
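For reference, the rewritten app.py reduces to the flow below. A minimal sketch without the Streamlit layer, assuming the model emits exactly the five labels listed in label_map; the sample text and commented values are illustrative:

# Minimal sketch of the new app's logic, outside Streamlit
from transformers import pipeline
from keybert import KeyBERT

sentiment_pipeline = pipeline(
    "text-classification",
    model="tabularisai/multilingual-sentiment-analysis"
)
kw_model = KeyBERT()

label_map = {
    "Very Negative": 0,
    "Negative": 1,
    "Neutral": 2,
    "Positive": 3,
    "Very Positive": 4,
}

text = "Amazing quality, worth the price!"  # illustrative sample input
result = sentiment_pipeline(text)[0]        # e.g. {'label': 'Very Positive', 'score': ...}
rating = label_map[result["label"]] + 1     # map the 0-4 label index to a 1-5 rating
keywords = [kw for kw, _ in kw_model.extract_keywords(
    text, keyphrase_ngram_range=(1, 2), stop_words="english", top_n=3
)]
print(f"Rating: {rating}/5 | Confidence: {result['score']:.2f} | Keywords: {', '.join(keywords)}")

In the Space itself the same logic runs behind st.text_area and st.write; locally the app would be launched with streamlit run app.py.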