Update app.py
app.py CHANGED
@@ -234,28 +234,29 @@ def classify_emotion(text, classifier):
     return final_emotion
 
 def get_embedding_for_text(text, tokenizer, model):
-    """Get embedding for complete text."""
-    chunks
+    """Get embedding for complete text while preserving all content."""
+    # Split into optimal chunks of exactly 512 tokens
+    tokenized_text = tokenizer.encode(text)
+    chunks = [tokenized_text[i:i + 512] for i in range(0, len(tokenized_text), 512)]
     chunk_embeddings = []
 
     for chunk in chunks:
-        inputs = tokenizer(
-            chunk,
+        inputs = tokenizer.encode(
+            tokenizer.decode(chunk),
             return_tensors="pt",
-            padding=
-            truncation=True,
+            padding='max_length',
             max_length=512
         )
-        inputs =
+        inputs = inputs.to(model.device)
 
         with torch.no_grad():
-
-            outputs = model(**inputs)[0]
+            outputs = model(inputs)[0]
         embedding = outputs[:, 0, :].cpu().numpy()
         chunk_embeddings.append(embedding[0])
 
     if chunk_embeddings:
-
+        # Weight each chunk based on its content length
+        weights = np.array([len(chunk) for chunk in chunks])
         weights = weights / weights.sum()
         weighted_embedding = np.average(chunk_embeddings, axis=0, weights=weights)
         return weighted_embedding
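For reference, below is a minimal, self-contained sketch of the chunk-and-average scheme this commit implements: split the text into 512-token chunks, embed each chunk via its [CLS] vector, then average the chunk vectors with weights proportional to chunk length. The function name get_chunked_embedding, the use of the tokenizer's __call__ with an attention mask, the explicit truncation=True, and the bert-base-uncased example are assumptions for illustration only, not code taken from app.py.

import numpy as np
import torch
from transformers import AutoModel, AutoTokenizer

def get_chunked_embedding(text, tokenizer, model, max_length=512):
    """Embed long text by weighted-averaging per-chunk [CLS] embeddings (sketch)."""
    # Tokenize once without special tokens, then slice into fixed-size chunks.
    token_ids = tokenizer.encode(text, add_special_tokens=False)
    chunks = [token_ids[i:i + max_length] for i in range(0, len(token_ids), max_length)]

    chunk_embeddings = []
    for chunk in chunks:
        # Re-encode the decoded chunk so special tokens, padding, and the
        # attention mask are produced by the tokenizer itself.
        inputs = tokenizer(
            tokenizer.decode(chunk),
            return_tensors="pt",
            padding="max_length",
            truncation=True,
            max_length=max_length,
        ).to(model.device)

        with torch.no_grad():
            outputs = model(**inputs)[0]              # last_hidden_state
        chunk_embeddings.append(outputs[:, 0, :].cpu().numpy()[0])  # [CLS] vector

    if not chunk_embeddings:
        return None

    # Weight each chunk by its token count so a short trailing chunk
    # contributes proportionally less to the final vector.
    weights = np.array([len(chunk) for chunk in chunks], dtype=float)
    weights /= weights.sum()
    return np.average(chunk_embeddings, axis=0, weights=weights)

# Illustrative usage (model choice is an assumption, not from this Space):
# tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
# model = AutoModel.from_pretrained("bert-base-uncased")
# vec = get_chunked_embedding("some long document ...", tokenizer, model)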