import os
import re
from collections import Counter

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from rank_bm25 import BM25Okapi
from langchain_text_splitters import NLTKTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings

# Download the NLTK sentence-tokenizer models once at import time.
nltk.download('punkt_tab')

def replace_case_insensitive(text: str, old: str, new: str) -> str:
    """Replace every occurrence of `old` in `text`, ignoring case."""
    pattern = re.compile(re.escape(old), re.IGNORECASE)
    return pattern.sub(new, text)
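
# Quick sanity check of the helper above (illustrative values only):
#
#   >>> replace_case_insensitive("Hello WORLD, hello world", "hello", "hi")
#   'hi WORLD, hi world'
#
# Note: `new` is passed through re.sub, so backslashes in it are treated
# as replacement-template escapes.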

def get_word_list(s1):
    """Tokenize mixed Chinese/English text: English words are kept whole,
    while each Chinese (CJK) character becomes its own token."""
    non_word = re.compile(r'\W')              # split on non-word characters
    cjk = re.compile(r'([\u4e00-\u9fa5])')    # capture each Chinese character

    tokens = []
    for part in non_word.split(s1.lower()):
        # re.split with a capturing group keeps each Chinese character as
        # its own element; the original `== None` check was dead code, since
        # re.split always returns a list.
        tokens.extend(cjk.split(part))

    # Drop empty strings and pure-whitespace fragments.
    return [w for w in tokens if len(w.strip()) > 0]


def get_word_len(s1):
    return len(get_word_list(s1))
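
# Illustrative example (made-up input): English words survive intact,
# Chinese characters are split one per token.
#
#   >>> get_word_list("Hello 世界 NLP")
#   ['hello', '世', '界', 'nlp']
#   >>> get_word_len("Hello 世界 NLP")
#   4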


# Sentence-ending punctuation (Chinese and English) for regex-based splitting.
regex = r'([。?!;\n.!?;]\s*)'


def retriveDoc(text, query, top_k=3):
    """Embed each sentence of `text`, index the embeddings with FAISS, and
    return the `top_k` sentences most similar to `query` (as LangChain
    `Document` objects)."""
    sentences = sent_tokenize(text)
    embeddings = OpenAIEmbeddings(
        model="text-embedding-3-small",
        base_url=os.environ.get("OPENAI_BASE_URL"),
        api_key=os.environ.get("OPENAI_API_KEY"),
    )

    # Build an in-memory FAISS index over the sentence embeddings.
    vector_store = FAISS.from_texts(sentences, embeddings)

    # Fetch the top_k sentences closest to the query in embedding space.
    retrieved_docs = vector_store.similarity_search(query, k=top_k)
    print("Retrieved sentences:", retrieved_docs)

    return retrieved_docs
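
# Illustrative usage (assumes OPENAI_API_KEY, and optionally OPENAI_BASE_URL,
# are set in the environment; the text and query below are made up):
#
#   docs = retriveDoc(
#       "Paris is the capital of France. Berlin is the capital of Germany.",
#       query="Which city is the capital of France?",
#       top_k=1,
#   )
#   print(docs[0].page_content)
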

def most_similar_sentence_bm25(paragraph, target_sentence):
    """Use the BM25 algorithm to find and return the sentence in `paragraph`
    that is most similar to `target_sentence`."""
    # Split the paragraph into sentences and tokenize each one.
    sentences = sent_tokenize(paragraph)
    tokenized_sentences = [word_tokenize(sent) for sent in sentences]

    # Build the BM25 index over the tokenized sentences.
    bm25 = BM25Okapi(tokenized_sentences)

    # Score every sentence against the tokenized target.
    target_tokens = word_tokenize(target_sentence)
    scores = bm25.get_scores(target_tokens)

    # get_scores returns a numpy array, so argmax() gives the best index.
    max_idx = scores.argmax()
    return sentences[max_idx]
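
# Illustrative usage with a made-up paragraph:
#
#   best = most_similar_sentence_bm25(
#       "The cat sat on the mat. Dogs love to play fetch.",
#       "A cat was sitting on a mat.",
#   )
#   # best == "The cat sat on the mat."
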

def f1_score_text(pred, gold):
    """Token-level F1 between a predicted sentence and a gold sentence."""
    pred_tokens = word_tokenize(pred)
    gold_tokens = word_tokenize(gold)
    # Multiset intersection: each shared token counts at most min(count) times.
    common = Counter(pred_tokens) & Counter(gold_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0.0
    precision = num_same / len(pred_tokens)
    recall = num_same / len(gold_tokens)
    f1 = 2 * precision * recall / (precision + recall)
    return f1


def compute_best_sentence_f1(pred_text, gold_text):
    """For each predicted sentence, take its best F1 against any gold
    sentence; return the average of those best scores over all predictions."""
    pred_sentences = sent_tokenize(pred_text)
    gold_sentences = sent_tokenize(gold_text)
    f1_scores = []
    for pred in pred_sentences:
        best_f1 = 0.0
        for gold in gold_sentences:
            f1 = f1_score_text(pred, gold)
            if f1 > best_f1:
                best_f1 = f1
        f1_scores.append(best_f1)
    avg_f1 = sum(f1_scores) / len(pred_sentences) if pred_sentences else 0.0
    return avg_f1
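
# Worked example with made-up strings: "the cat sat" vs. "the cat ran"
# share 2 of 3 tokens, so precision = recall = 2/3 and F1 = 2/3.
#
#   >>> f1_score_text("the cat sat", "the cat ran")
#   0.6666666666666666
#
# compute_best_sentence_f1 applies this per predicted sentence against its
# best-matching gold sentence, then averages the per-sentence maxima.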