Az-r-ow
committed on
Commit
·
aba5a2f
1
Parent(s):
db55dfb
feat(processing): option to remove stopwords or not
Browse files
app/travel_resolver/libs/nlp/data_processing.py
CHANGED
|
@@ -38,6 +38,7 @@ def get_tagged_content(sentence: str, tag: str) -> str | None:
|
|
| 38 |
|
| 39 |
def process_sentence(
|
| 40 |
sentence: str,
|
|
|
|
| 41 |
stemming: bool = False,
|
| 42 |
return_tokens: bool = False,
|
| 43 |
labels_to_adapt: list[int | str] | None = None,
|
|
@@ -50,6 +51,7 @@ def process_sentence(
|
|
| 50 |
|
| 51 |
Args:
|
| 52 |
sentence (str): The sentence to process.
|
|
|
|
| 53 |
stemming (bool): Whether to stem the tokens.
|
| 54 |
return_tokens (bool): Whether to return the tokens instead of the sentence.
|
| 55 |
labels_to_adapt (list[int | str] | None): The labels to adapt.
|
|
@@ -68,7 +70,7 @@ def process_sentence(
|
|
| 68 |
|
| 69 |
for token, label in zip(tokenized_sentence, labels_to_adapt):
|
| 70 |
# Skipping stopwords
|
| 71 |
-
if token in stopwords:
|
| 72 |
continue
|
| 73 |
token = token if not stemming else stemmer.stem(token)
|
| 74 |
processed_sentence += token + " "
|
|
|
|
| 38 |
|
| 39 |
def process_sentence(
|
| 40 |
sentence: str,
|
| 41 |
+
rm_stopwords: bool = False,
|
| 42 |
stemming: bool = False,
|
| 43 |
return_tokens: bool = False,
|
| 44 |
labels_to_adapt: list[int | str] | None = None,
|
|
|
|
| 51 |
|
| 52 |
Args:
|
| 53 |
sentence (str): The sentence to process.
|
| 54 |
+
rm_stopwords (bool): Whether to remove stopwords.
|
| 55 |
stemming (bool): Whether to stem the tokens.
|
| 56 |
return_tokens (bool): Whether to return the tokens instead of the sentence.
|
| 57 |
labels_to_adapt (list[int | str] | None): The labels to adapt.
|
|
|
|
| 70 |
|
| 71 |
for token, label in zip(tokenized_sentence, labels_to_adapt):
|
| 72 |
# Skipping stopwords
|
| 73 |
+
if token in stopwords and rm_stopwords:
|
| 74 |
continue
|
| 75 |
token = token if not stemming else stemmer.stem(token)
|
| 76 |
processed_sentence += token + " "
|