Update app.py
Yaron Koresh committed

app.py CHANGED
@@ -559,51 +559,54 @@ def get_tensor_length(tensor):
         ret = ret * num
     return ret
 
-def summarize(
-    …
-):
+def _summarize(text, max_words=20):
+    prefix = "summarize: "
+    toks = tokenizer.encode( prefix + text, return_tensors="pt", truncation=False)
+    gen = model.generate(
+        toks,
+        length_penalty=2.0,
+        num_beams=max( get_tensor_length(toks) // 4 , 4 ),
+        early_stopping=True,
+        max_length=max_words
+    )
+    return tokenizer.decode(gen[0], skip_special_tokens=True)
+
+def _summ_step(length):
+    return max(length // 3 , min(10, length - 4))
+
+def summarize(text, max_words=20):
     log(f'CALL summarize')
 
     words = text.split()
 
     if len(words) < 5:
-        print("Summarization Error: Text is too short, 5 words minimum")
+        print("Summarization Error: Text is too short, 5 words minimum.")
         return text
 
-    …
-
-    …
-        inputs = model.generate(
-            inputs,
-            length_penalty=2.0,
-            num_beams=4,
-            early_stopping=True,
-            max_length=max( get_tensor_length(inputs) // 4 , max_len ),
-            min_length=min_len
-        )
-
-        toks = tokenizer.decode(inputs[0], skip_special_tokens=True)
-        ret = ret + ("" if ret == "" else " ") + toks
-
-    inputs = tokenizer.encode( prefix + ret, return_tensors="pt", truncation=False)
-    gen = model.generate(
-        inputs,
-        length_penalty=1.0,
-        num_beams=4,
-        early_stopping=True,
-        max_length=max_len,
-        min_length=min_len
-    )
-    summary = tokenizer.decode(gen[0], skip_special_tokens=True)
-    log(f'RET summarize with summary as {summary}')
-    return summary
+    if max_words < 5 or max_words > 500:
+        print("Summarization Error: max_words value must be between 5 and 500 words.")
+        return text
+
+    words_length = len(text.split())
+
+    if words_length >= 510:
+        shrink_step = 500 // (len(text.split()) / 500)
+        while words_length >= 510:
+            words = text.split()
+            above_limit = words[510:]
+            text = _summarize(
+                " ".join(words[0:510]), shrink_step
+            ) + " ".join(words[510:])
+            words_length = len(text.split())
+
+    while words_length > max_words:
+        step = _summ_step(words_length)
+        mx = words_length - step
+        text = _summarize(text, mx)
+        words_length = len(text.split())
+
+    log(f'RET summarize with text as {text}')
+    return text
 
 def generate_random_string(length):
     characters = str(ascii_letters + digits)
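Note on context: beyond two body lines of get_tensor_length in the hunk header, this diff never shows how model, tokenizer, or log are created; they are defined elsewhere in app.py. Below is a minimal sketch of that surrounding setup, assuming a T5-style seq2seq checkpoint; the checkpoint name and the log body are illustrative guesses, not part of the commit.

# Sketch of the context the new helpers assume; "t5-small" and log() are
# placeholders, not taken from this commit.
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("t5-small")
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")

def log(msg):
    # stand-in for the app's logger called by summarize()
    print(msg)

def get_tensor_length(tensor):
    # one body consistent with the two context lines shown above:
    # multiply out every dimension of the tensor's shape
    ret = 1
    for num in tensor.shape:
        ret = ret * num
    return ret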
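Why the final loop terminates: for any words_length > max_words >= 5, _summ_step returns at least min(10, words_length - 4) >= 1, so the word budget strictly decreases on every pass. Here is a model-free trace of that schedule, assuming each _summarize call lands exactly on its max_words budget (generate() does not strictly guarantee this, and it counts tokens rather than words).

def _summ_step(length):
    # copied from the diff: drop a third of the words, but never fewer
    # than min(10, length - 4)
    return max(length // 3, min(10, length - 4))

length, max_words = 510, 20
schedule = []
while length > max_words:
    length -= _summ_step(length)   # the diff passes mx = words_length - step
    schedule.append(length)
print(schedule)   # [340, 227, 152, 102, 68, 46, 31, 21, 11]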
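One caveat for anyone reusing the long-input pass: 500 // (len(text.split()) / 500) mixes floor division with float division, so shrink_step comes out as a float (for a 1,020-word input, 500 // 2.04 gives 245.0), while generate() expects an integer max_length. Below is a hedged sketch of the same pass with an explicit int cast and a space at the concatenation seam; both adjustments are mine, not part of the commit, preshrink is a hypothetical name (the diff inlines this logic in summarize), and _summarize from the commit is assumed to be in scope.

def preshrink(text):
    # Same 510-word pre-shrink pass as in the diff; int(...) and the " "
    # separator are adjustments, and the diff's unused above_limit binding
    # is dropped.
    words_length = len(text.split())
    if words_length >= 510:
        shrink_step = int(500 // (words_length / 500))
        while words_length >= 510:
            words = text.split()
            text = _summarize(" ".join(words[:510]), shrink_step) \
                + " " + " ".join(words[510:])
            words_length = len(text.split())
    return text

Typical use after this commit stays a single call, e.g. summarize(article_text, max_words=30); the pre-shrink pass only fires for inputs of 510 words or more.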