Yaron Koresh committed: Update app.py
app.py
CHANGED
@@ -558,24 +558,45 @@ def summarize(
     text, max_len=20, min_len=10
 ):
     log(f'CALL summarize')
-
-
-
+
+    words = text.split()
+
+    if get_tensor_length(words) < 5:
+        print("Summarization Error: Text is too short, 5 words minimum!")
         return text
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+    prefix = "summarize: "
+    ret = ""
+
+    for index in math.ceil( len(words) / 512 ):
+
+        chunk = " ".join(words[ index*512:(index+1)*512 ])
+        inputs = tokenizer.encode( prefix + chunk, return_tensors="pt", max_length=float('inf'), truncation=False)
+
+        while get_tensor_length(inputs) > max_len:
+
+            inputs = model.generate(
+                inputs,
+                length_penalty=2.0,
+                num_beams=4,
+                early_stopping=True,
+                max_length=max( get_tensor_length(inputs) // 4 , max_len ),
+                min_length=min_len
+            )
+
+        toks = tokenizer.decode(inputs[0], skip_special_tokens=True)
+        ret = ret + ("" if ret == "" else " ") + toks
+
+    inputs = tokenizer.encode( prefix + ret, return_tensors="pt", max_length=float('inf'), truncation=False)
+    gen = model.generate(
+        inputs,
+        length_penalty=1.0,
+        num_beams=4,
+        early_stopping=True,
+        max_length=max_len,
+        min_length=min_len
+    )
+    summary = tokenizer.decode(gen[0], skip_special_tokens=True)
     log(f'RET summarize with summary as {summary}')
     return summary
 
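The new summarize body follows a map-reduce pattern: split the text into 512-word chunks, summarize each chunk, then summarize the concatenated partial summaries down to max_len tokens. For reference, here is a minimal runnable sketch of that pattern for a T5-style seq2seq model; the name `summarize_sketch`, the `chunk_words` parameter, and the single `generate` call per chunk (in place of the committed while-loop) are illustrative choices, and `tokenizer`/`model` are assumed to be the Hugging Face objects already loaded elsewhere in app.py. The loop index comes from `range(math.ceil(...))`, and `tokenizer.encode` is left at its default length handling since `max_length` expects an integer.

```python
import math

def summarize_sketch(text, tokenizer, model, max_len=20, min_len=10, chunk_words=512):
    # Illustrative sketch of the chunk-then-re-summarize idea (not the app's exact code).
    words = text.split()
    if len(words) < 5:
        # Too short to summarize; return the input unchanged.
        return text

    prefix = "summarize: "
    partial = ""
    for index in range(math.ceil(len(words) / chunk_words)):
        chunk = " ".join(words[index * chunk_words:(index + 1) * chunk_words])
        inputs = tokenizer.encode(prefix + chunk, return_tensors="pt", truncation=False)
        gen = model.generate(
            inputs,
            length_penalty=2.0,
            num_beams=4,
            early_stopping=True,
            max_length=max(inputs.shape[-1] // 4, max_len),
            min_length=min_len,
        )
        toks = tokenizer.decode(gen[0], skip_special_tokens=True)
        partial = partial + ("" if partial == "" else " ") + toks

    # Reduce step: summarize the concatenated chunk summaries down to max_len tokens.
    inputs = tokenizer.encode(prefix + partial, return_tensors="pt", truncation=False)
    gen = model.generate(
        inputs,
        length_penalty=1.0,
        num_beams=4,
        early_stopping=True,
        max_length=max_len,
        min_length=min_len,
    )
    return tokenizer.decode(gen[0], skip_special_tokens=True)
```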
@@ -633,6 +654,64 @@ def all_pipes(pos,neg,artist,song):
 
     return imgs
 
+language_codes = {
+    "af": "Afrikaans",
+    "ar": "Arabic",
+    "bg": "Bulgarian",
+    "bn": "Bengali",
+    "ca": "Catalan",
+    "cs": "Czech",
+    "cy": "Welsh",
+    "da": "Danish",
+    "de": "German",
+    "el": "Greek",
+    "en": "English",
+    "es": "Spanish",
+    "et": "Estonian",
+    "fa": "Persian (Farsi)",
+    "fi": "Finnish",
+    "fr": "French",
+    "gu": "Gujarati",
+    "he": "Hebrew",
+    "hi": "Hindi",
+    "hr": "Croatian",
+    "hu": "Hungarian",
+    "id": "Indonesian",
+    "it": "Italian",
+    "ja": "Japanese",
+    "kn": "Kannada",
+    "ko": "Korean",
+    "lt": "Lithuanian",
+    "lv": "Latvian",
+    "mk": "Macedonian",
+    "ml": "Malayalam",
+    "mr": "Marathi",
+    "ne": "Nepali",
+    "nl": "Dutch",
+    "no": "Norwegian",
+    "pa": "Punjabi",
+    "pl": "Polish",
+    "pt": "Portuguese",
+    "ro": "Romanian",
+    "ru": "Russian",
+    "sk": "Slovak",
+    "sl": "Slovenian",
+    "so": "Somali",
+    "sq": "Albanian",
+    "sv": "Swedish",
+    "sw": "Swahili",
+    "ta": "Tamil",
+    "te": "Telugu",
+    "th": "Thai",
+    "tl": "Tagalog (Filipino)",
+    "tr": "Turkish",
+    "uk": "Ukrainian",
+    "ur": "Urdu",
+    "vi": "Vietnamese",
+    "zh-cn": "Chinese (Simplified)",
+    "zh-tw": "Chinese (Traditional)",
+}
+
 def translate(txt,to_lang="en",from_lang=False):
     log(f'CALL translate')
     if not from_lang:
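The new language_codes table maps detected language codes to the plain-English names a T5-style translation prompt expects ("translate German to English: ..."). A small illustrative helper showing how such a prefix can be built from that table, assuming it is in scope; the `.get` fallback to the raw code is an addition for the example, not part of the commit.

```python
def build_translation_prefix(from_lang, to_lang, codes):
    # Turn language codes into the human-readable names used in a T5-style prompt,
    # e.g. build_translation_prefix("de", "en", language_codes) -> "translate German to English: ".
    # Falling back to the raw code when it is missing from the table is an
    # illustrative choice, not something the commit does.
    src = codes.get(from_lang, from_lang)
    dst = codes.get(to_lang, to_lang)
    return f"translate {src} to {dst}: "
```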
@@ -640,12 +719,13 @@ def translate(txt,to_lang="en",from_lang=False):
     if(from_lang == to_lang):
         log(f'RET translate with txt as {txt}')
         return txt
-
-
+    prefix = f"translate {language_codes[from_lang]} to {language_codes[to_lang]}: "
+    words = txt.split()
     ret = ""
-    for index in
-    chunk =
-
+    for index in math.ceil( len(words) / 512 ):
+        chunk = " ".join(words[ index*512:(index+1)*512 ])
+        inputs = tokenizer.encode(prefix+chunk, return_tensors="pt", max_length=float('inf'), truncation=False)
+        gen = model.generate(chunk,input)
         toks = tokenizer.decode(gen[0], skip_special_tokens=True)
         ret = ret + ("" if ret == "" else " ") + toks
     log(f'RET translate with ret as {ret}')
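The translate hunk applies the same 512-word chunking to translation, building the prompt from language_codes and concatenating the decoded outputs. A minimal runnable sketch under the same assumptions as the summarize sketch above (the name `translate_sketch`, the `chunk_words` parameter, and the explicit `max_length` cap are illustrative); note that the encoded `inputs` tensor, not the raw chunk string, is what gets passed to `model.generate`.

```python
import math

def translate_sketch(txt, tokenizer, model, language_codes, to_lang="en", from_lang="de", chunk_words=512):
    # Illustrative sketch of chunked translation with a T5-style prompt (not the app's exact code).
    if from_lang == to_lang:
        return txt

    prefix = f"translate {language_codes[from_lang]} to {language_codes[to_lang]}: "
    words = txt.split()
    ret = ""
    for index in range(math.ceil(len(words) / chunk_words)):
        chunk = " ".join(words[index * chunk_words:(index + 1) * chunk_words])
        # Encode the prompt plus chunk, then generate from the token ids (not the raw string).
        inputs = tokenizer.encode(prefix + chunk, return_tensors="pt", truncation=False)
        gen = model.generate(
            inputs,
            num_beams=4,
            early_stopping=True,
            max_length=1024,  # generous cap; tune to the model actually loaded
        )
        toks = tokenizer.decode(gen[0], skip_special_tokens=True)
        ret = ret + ("" if ret == "" else " ") + toks
    return ret
```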