Update app.py
app.py CHANGED
@@ -10,7 +10,9 @@ import os
 from wordcloud import WordCloud
 import matplotlib.pyplot as plt
 import pkg_resources
-import
+import folium
+import country_converter as coco
+
 
 current_dir = os.path.dirname(os.path.abspath(__file__))
 font_path = os.path.join(current_dir, "ArabicR2013-J25x.ttf")
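Note: folium and country_converter are new third-party imports, so for the Space to boot they presumably also need to be listed in requirements.txt, which this commit does not show. A guessed snippet of what those entries might look like:

folium
country_converter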
@@ -52,16 +54,7 @@ def load_models():
         return_all_scores=True
     )
     return tokenizer, bert_model, emotion_classifier
-
-@st.cache_data
-def cache_embeddings(text, _tokenizer, _model):
-    return get_embedding_for_text(text, _tokenizer, _model)
-
-@st.cache_data
-def cache_emotion_classification(text, _classifier):
-    return classify_emotion(text, _classifier)
 
-@st.cache_data
 def split_text(text, max_length=512):
     """Split text into chunks of maximum token length while preserving word boundaries."""
     words = text.split()
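Note: the deleted wrappers relied on Streamlit's convention that parameters whose names start with an underscore (here _tokenizer, _model, _classifier) are excluded from the cache key, which is how unhashable objects can be passed into an @st.cache_data function. A minimal sketch of that pattern, with a hypothetical _model argument:

import streamlit as st

@st.cache_data
def cached_score(text: str, _model):
    # _model is skipped when Streamlit hashes the arguments,
    # so only `text` decides whether the cached result is reused.
    return _model(text)

The commit drops this caching layer rather than adjusting it.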
@@ -84,6 +77,62 @@ def split_text(text, max_length=512):
         chunks.append(' '.join(current_chunk))
 
     return chunks
+
+def get_country_coordinates():
+    """Returns dictionary of Arab country coordinates"""
+    return {
+        'Egypt': [26.8206, 30.8025],
+        'Saudi Arabia': [23.8859, 45.0792],
+        'UAE': [23.4241, 53.8478],
+        'Kuwait': [29.3117, 47.4818],
+        'Iraq': [33.2232, 43.6793],
+        'Syria': [34.8021, 38.9968],
+        'Lebanon': [33.8547, 35.8623],
+        'Jordan': [30.5852, 36.2384],
+        'Palestine': [31.9522, 35.2332],
+        'Yemen': [15.5527, 48.5164],
+        'Oman': [21.4735, 55.9754],
+        'Qatar': [25.3548, 51.1839],
+        'Bahrain': [26.0667, 50.5577],
+        'Sudan': [12.8628, 30.2176],
+        'Libya': [26.3351, 17.2283],
+        'Tunisia': [33.8869, 9.5375],
+        'Algeria': [28.0339, 1.6596],
+        'Morocco': [31.7917, -7.0926],
+        'Mauritania': [21.0079, -10.9408]
+    }
+
+def create_topic_map(summaries):
+    """Create an interactive map showing topic distribution"""
+    coordinates = get_country_coordinates()
+
+    # Create base map centered on Arab world
+    m = folium.Map(location=[25.0, 30.0], zoom_start=4)
+
+    for summary in summaries:
+        country = summary['country']
+        if country in coordinates:
+            # Get top topic
+            top_topic = summary['top_topics'][0]['topic'] if summary['top_topics'] else "No topics"
+            top_emotion = summary['top_emotions'][0]['emotion'] if summary['top_emotions'] else "No emotion"
+
+            # Create popup content
+            popup_content = f"""
+            <b>{country}</b><br>
+            Top Topic: {top_topic}<br>
+            Main Emotion: {top_emotion}<br>
+            Total Poems: {summary['total_poems']}
+            """
+
+            # Add marker
+            folium.CircleMarker(
+                location=coordinates[country],
+                radius=10,
+                popup=folium.Popup(popup_content, max_width=300),
+                color='red',
+                fill=True
+            ).add_to(m)
+
+    return m
 
 def create_arabic_wordcloud(text, title):
     wordcloud = WordCloud(
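A rough usage sketch for the new map helper; the dictionary keys match what create_topic_map reads, but the sample values are invented:

summaries = [{
    'country': 'Egypt',
    'total_poems': 120,
    'top_topics': [{'topic': 'love | night | longing'}],
    'top_emotions': [{'emotion': 'LABEL_1'}],
}]
m = create_topic_map(summaries)
m.save("topic_map.html")  # a folium map can also be written out as standalone HTML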
@@ -170,9 +219,9 @@ def classify_emotion(text, classifier):
     return "LABEL_2"
 
 def get_embedding_for_text(text, tokenizer, model):
+    """Get embedding for complete text."""
     chunks = split_text(text)
     chunk_embeddings = []
-    embedding_size = model.config.hidden_size
 
     for chunk in chunks:
         try:
@@ -189,16 +238,18 @@ def get_embedding_for_text(text, tokenizer, model):
             outputs = model(**inputs)
 
             embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()
-            chunk_embeddings.append(embedding)
+            chunk_embeddings.append(embedding[0])
        except Exception as e:
+            st.warning(f"Error processing chunk: {str(e)}")
            continue
 
    if chunk_embeddings:
-
-
-
-
-
+        weights = np.array([len(chunk.split()) for chunk in chunks])
+        weights = weights / weights.sum()
+        weighted_embedding = np.average(chunk_embeddings, axis=0, weights=weights)
+        return weighted_embedding
+    return np.zeros(model.config.hidden_size)
+
 def format_topics(topic_model, topic_counts):
     """Format topics for display."""
     formatted_topics = []
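The new aggregation weights each chunk's CLS embedding by its word count before averaging. A small self-contained illustration of the same np.average call, with toy numbers:

import numpy as np

# Two chunk embeddings of size 4; the first chunk has 3 words, the second has 1.
chunk_embeddings = [np.array([1.0, 0.0, 0.0, 0.0]),
                    np.array([0.0, 1.0, 0.0, 0.0])]
weights = np.array([3, 1])
weights = weights / weights.sum()                        # [0.75, 0.25]
doc_embedding = np.average(chunk_embeddings, axis=0, weights=weights)
print(doc_embedding)                                     # [0.75 0.25 0.   0.  ]

Note that the weights are computed over all chunks while chunk_embeddings only holds the chunks that did not raise, so the two lengths can diverge if a chunk fails.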
@@ -233,17 +284,20 @@ def format_emotions(emotion_counts):
     return formatted_emotions
 
 def process_and_summarize(df, bert_tokenizer, bert_model, emotion_classifier, top_n=50, topic_strategy="Auto", n_topics=None, min_topic_size=3):
+    """Process the data and generate summaries with flexible topic configuration."""
     summaries = []
-    embedding_size = bert_model.config.hidden_size
 
     topic_model_params = {
         "language": "arabic",
         "calculate_probabilities": True,
-        "min_topic_size":
+        "min_topic_size": 3,
         "n_gram_range": (1, 1),
         "top_n_words": 15,
         "verbose": True,
     }
+    st.write(f"Total documents: {len(df)}")
+    st.write(f"Topic strategy: {topic_strategy}")
+    st.write(f"Min topic size: {min_topic_size}")
 
     if topic_strategy == "Manual":
         topic_model_params["nr_topics"] = n_topics
@@ -251,15 +305,12 @@ def process_and_summarize(df, bert_tokenizer, bert_model, emotion_classifier, top_n=50, topic_strategy="Auto", n_topics=None, min_topic_size=3):
         topic_model_params["nr_topics"] = "auto"
 
     topic_model = BERTopic(
-        embedding_model=
-        **topic_model_params
-    )
+        embedding_model=bert_model,
+        **topic_model_params)
 
-    vectorizer = CountVectorizer(
-
-
-        max_df=1.0
-    )
+    vectorizer = CountVectorizer(stop_words=list(ARABIC_STOP_WORDS),
+                                 min_df=1,
+                                 max_df=1.0)
     topic_model.vectorizer_model = vectorizer
 
     for country, group in df.groupby('country'):
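Setting topic_model.vectorizer_model after construction works; BERTopic also accepts the vectorizer directly in its constructor, which is an equivalent way to wire it up. A sketch reusing the app's ARABIC_STOP_WORDS, bert_model and topic_model_params:

from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(stop_words=list(ARABIC_STOP_WORDS), min_df=1, max_df=1.0)
topic_model = BERTopic(
    embedding_model=bert_model,
    vectorizer_model=vectorizer,  # same effect as assigning topic_model.vectorizer_model afterwards
    **topic_model_params,
)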
@@ -268,48 +319,42 @@ def process_and_summarize(df, bert_tokenizer, bert_model, emotion_classifier, top_n=50, topic_strategy="Auto", n_topics=None, min_topic_size=3):
 
         texts = [clean_arabic_text(poem) for poem in group['poem'].dropna()]
         all_emotions = []
-        embeddings = []
 
-
+        embeddings = []
         for i, text in enumerate(texts):
             try:
                 embedding = get_embedding_for_text(text, bert_tokenizer, bert_model)
-                if embedding is not None and embedding.
+                if embedding is not None and not np.isnan(embedding).any():
                     embeddings.append(embedding)
-
-
+                else:
+                    st.warning(f"Invalid embedding generated for text {i+1} in {country}")
+                    continue
             except Exception as e:
-                st.warning(f"Error
+                st.warning(f"Error generating embedding for text {i+1} in {country}: {str(e)}")
                 continue
+            progress = (i + 1) / len(texts) * 0.4
+            progress_bar.progress(progress, text=f"Generated embeddings for {i+1}/{len(texts)} poems...")
 
-
-
-
-
-
-
-
-
-
-            try:
-                emotion = classify_emotion(text, emotion_classifier)
-                all_emotions.append(emotion)
-                progress = 0.4 + ((i + 1) / len(texts) * 0.3)
-                progress_bar.progress(progress, text=f"Classified emotions for {i+1}/{len(texts)} poems...")
-            except Exception as e:
-                st.warning(f"Error classifying emotion for poem {i+1} in {country}: {str(e)}")
-                continue
+        if len(embeddings) != len(texts):
+            texts = texts[:len(embeddings)]
+        embeddings = np.array(embeddings)
+
+        for i, text in enumerate(texts):
+            emotion = classify_emotion(text, emotion_classifier)
+            all_emotions.append(emotion)
+            progress = 0.4 + ((i + 1) / len(texts) * 0.3)
+            progress_bar.progress(progress, text=f"Classified emotions for {i+1}/{len(texts)} poems...")
 
         try:
+
             if len(texts) < min_topic_size:
                 st.warning(f"Not enough documents for {country} to generate meaningful topics (minimum {min_topic_size} required)")
                 continue
+
 
-            # Ensure texts and embeddings match
-            texts = texts[:len(embeddings)]
-
-            # Fit and transform the topic model
             topics, probs = topic_model.fit_transform(texts, embeddings)
+
+
             topic_counts = Counter(topics)
 
             top_topics = format_topics(topic_model, topic_counts.most_common(top_n))
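The added bookkeeping trims texts to len(embeddings) when the counts diverge, which assumes failed embeddings always come from the tail of the list. Keeping each poem paired with its own embedding avoids that assumption; a minimal sketch of the idea (not what this commit does):

valid_texts, valid_embeddings = [], []
for text in texts:
    emb = get_embedding_for_text(text, bert_tokenizer, bert_model)
    if emb is not None and not np.isnan(emb).any():
        valid_texts.append(text)
        valid_embeddings.append(emb)

texts = valid_texts
embeddings = np.array(valid_embeddings)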
@@ -329,7 +374,6 @@ def process_and_summarize(df, bert_tokenizer, bert_model, emotion_classifier, top_n=50, topic_strategy="Auto", n_topics=None, min_topic_size=3):
 
     return summaries, topic_model
 
-
 try:
     bert_tokenizer, bert_model, emotion_classifier = load_models()
     st.success("Models loaded successfully!")
@@ -412,7 +456,7 @@ if uploaded_file is not None:
         if summaries:
             st.success("Analysis complete!")
 
-            tab1, tab2 = st.tabs(["Country Summaries", "Global Topics"])
+            tab1, tab2, tab3 = st.tabs(["Country Summaries", "Global Topics", "Topic Map"])
 
             with tab1:
                 for summary in summaries:
@@ -445,6 +489,12 @@ if uploaded_file is not None:
                         words = topic_model.get_topic(row['Topic'])
                         topic_name = " | ".join([word for word, _ in words[:5]])
                         st.write(f"• Topic {row['Topic']}: {topic_name} ({row['Count']} poems)")
+
+            with tab3:
+                st.subheader("Topic Distribution Map")
+                topic_map = create_topic_map(summaries)
+                # Display the map
+                st.components.v1.html(topic_map._repr_html_(), height=600)
 
     except Exception as e:
         st.error(f"Error processing file: {str(e)}")
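Embedding the folium map via st.components.v1.html(topic_map._repr_html_(), height=600) is one option; the streamlit-folium package provides a dedicated component that does the same job. A sketch assuming that extra dependency is installed:

from streamlit_folium import st_folium

topic_map = create_topic_map(summaries)
st_folium(topic_map, height=600)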