Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -53,17 +53,53 @@ def split_text(text, max_length=512):
|
|
| 53 |
|
| 54 |
return chunks
|
| 55 |
|
|
|
|
|
|
|
| 56 |
def classify_emotion(text, classifier):
|
| 57 |
-
"""Classify emotion for complete text."""
|
| 58 |
try:
|
| 59 |
# Split text into manageable chunks
|
| 60 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
|
| 62 |
all_scores = []
|
| 63 |
for chunk in chunks:
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 67 |
|
| 68 |
# Average scores across all chunks
|
| 69 |
if all_scores:
|
|
@@ -92,6 +128,7 @@ def classify_emotion(text, classifier):
|
|
| 92 |
st.warning(f"Error in emotion classification: {str(e)}")
|
| 93 |
return "LABEL_2" # Default to neutral
|
| 94 |
|
|
|
|
| 95 |
def get_embedding_for_text(text, tokenizer, model):
|
| 96 |
"""Get embedding for complete text."""
|
| 97 |
chunks = split_text(text)
|
|
@@ -301,7 +338,7 @@ else:
|
|
| 301 |
# Example format
|
| 302 |
st.write("### Expected File Format:")
|
| 303 |
example_df = pd.DataFrame({
|
| 304 |
-
'country': ['Egypt', '
|
| 305 |
-
'poem': ['قصيدة مصرية', 'قصيدة
|
| 306 |
})
|
| 307 |
st.dataframe(example_df)
|
|
|
|
| 53 |
|
| 54 |
return chunks
|
| 55 |
|
| 56 |
+
# The beginning of the code remains the same until the classify_emotion function
|
| 57 |
+
|
| 58 |
def classify_emotion(text, classifier):
|
| 59 |
+
"""Classify emotion for complete text with proper token handling."""
|
| 60 |
try:
|
| 61 |
# Split text into manageable chunks
|
| 62 |
+
words = text.split()
|
| 63 |
+
chunks = []
|
| 64 |
+
current_chunk = []
|
| 65 |
+
current_length = 0
|
| 66 |
+
|
| 67 |
+
# Create chunks that respect the 512 token limit
|
| 68 |
+
for word in words:
|
| 69 |
+
# Count the tokens the tokenizer produces for this word
|
| 70 |
+
word_tokens = len(classifier.tokenizer.encode(word))
|
| 71 |
+
if current_length + word_tokens > 512:
|
| 72 |
+
if current_chunk:
|
| 73 |
+
chunks.append(' '.join(current_chunk))
|
| 74 |
+
current_chunk = [word]
|
| 75 |
+
current_length = word_tokens
|
| 76 |
+
else:
|
| 77 |
+
current_chunk.append(word)
|
| 78 |
+
current_length += word_tokens
|
| 79 |
+
|
| 80 |
+
if current_chunk:
|
| 81 |
+
chunks.append(' '.join(current_chunk))
|
| 82 |
+
|
| 83 |
+
# If no chunks were created, use the original text with truncation
|
| 84 |
+
if not chunks:
|
| 85 |
+
chunks = [text]
|
| 86 |
|
| 87 |
all_scores = []
|
| 88 |
for chunk in chunks:
|
| 89 |
+
try:
|
| 90 |
+
# Ensure proper truncation
|
| 91 |
+
inputs = classifier.tokenizer(
|
| 92 |
+
chunk,
|
| 93 |
+
truncation=True,
|
| 94 |
+
max_length=512,
|
| 95 |
+
return_tensors="pt"
|
| 96 |
+
)
|
| 97 |
+
result = classifier(chunk, truncation=True, max_length=512)
|
| 98 |
+
scores = result[0]
|
| 99 |
+
all_scores.append(scores)
|
| 100 |
+
except Exception as chunk_error:
|
| 101 |
+
st.warning(f"Skipping chunk due to error: {str(chunk_error)}")
|
| 102 |
+
continue
|
| 103 |
|
| 104 |
# Average scores across all chunks
|
| 105 |
if all_scores:
|
|
|
|
| 128 |
st.warning(f"Error in emotion classification: {str(e)}")
|
| 129 |
return "LABEL_2" # Default to neutral
|
| 130 |
|
| 131 |
+
|
| 132 |
def get_embedding_for_text(text, tokenizer, model):
|
| 133 |
"""Get embedding for complete text."""
|
| 134 |
chunks = split_text(text)
|
|
|
|
| 338 |
# Example format
|
| 339 |
st.write("### Expected File Format:")
|
| 340 |
example_df = pd.DataFrame({
|
| 341 |
+
'country': ['Egypt', 'Palestine'],
|
| 342 |
+
'poem': ['قصيدة مصرية', 'قصيدة فلسطينية ']
|
| 343 |
})
|
| 344 |
st.dataframe(example_df)
|