Spaces:
Runtime error
Runtime error
Commit
·
a8dbb61
0
Parent(s):
Duplicate from akuysal/SMS-spam-Turkish-sklearn
Browse files- .gitattributes +34 -0
- LinearSVC_SMS_spam_TR.pickle +3 -0
- README.md +22 -0
- app.py +52 -0
- requirements.txt +3 -0
- tfidf_vectorizer_TR.pickle +3 -0
.gitattributes
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
LinearSVC_SMS_spam_TR.pickle
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2e1b32d1f4716a7c48facea2b8630b897be52618b461cbb2bb4f20f34b9df52f
|
| 3 |
+
size 23303
|
README.md
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: SMS Spam Turkish Scikit-Learn
|
| 3 |
+
emoji: 🌖
|
| 4 |
+
colorFrom: gray
|
| 5 |
+
colorTo: green
|
| 6 |
+
sdk: streamlit
|
| 7 |
+
sdk_version: 1.17.0
|
| 8 |
+
app_file: app.py
|
| 9 |
+
pinned: false
|
| 10 |
+
license: openrail
|
| 11 |
+
duplicated_from: akuysal/SMS-spam-Turkish-sklearn
|
| 12 |
+
---
|
| 13 |
+
|
| 14 |
+
ENGLISH
|
| 15 |
+
The model is trained on the dataset from the study "Uysal, A. K., Gunal, S., Ergin, S., & Gunal, E. S. (2013). The impact of feature extraction and selection on SMS spam filtering. Elektronika ir Elektrotechnika, 19(5), 67-72." The Linear SVM classifier achieves a Macro-F1 score of 0.9880 when 10% of the dataset is held out for testing.
|
| 16 |
+
The dataset consists of SMS messages labelled as either SPAM or LEGITIMATE.
|
| 17 |
+
|
| 18 |
+
TÜRKÇE
|
| 19 |
+
Bu çalışmada "Uysal, A. K., Gunal, S., Ergin, S., & Gunal, E. S. (2013). The impact of feature extraction and selection on SMS spam filtering. Elektronika ir Elektrotechnika, 19(5), 67-72." başlıklı çalışmadaki veri seti kullanılmıştır. Linear SVM sınıflandırıcı için başarı oranı, veri setinin %10'u test için kullanıldığında Makro-F1 açısından 0,9880'dir.
|
| 20 |
+
Veri seti, SPAM ve LEGITIMATE kısa mesaj verilerinden oluşmaktadır.
|
| 21 |
+
|
| 22 |
+
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
app.py
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 2 |
+
from TurkishStemmer import TurkishStemmer
|
| 3 |
+
import string
|
| 4 |
+
# import for loading python objects (scikit-learn models)
|
| 5 |
+
import pickle
|
| 6 |
+
import nltk
|
| 7 |
+
from nltk.data import load
|
| 8 |
+
import streamlit as st
|
| 9 |
+
import sklearn
|
| 10 |
+
|
| 11 |
+
# Fetch the tokenizer models that nltk.word_tokenize needs at runtime.
# Newer NLTK releases look up the "punkt_tab" resource instead of the pickled
# "punkt" models, and the nltk requirement is unpinned, so request both.
# An id that the installed NLTK does not know is reported by the downloader
# rather than raised, so this stays safe on older versions.
for _nltk_resource in ("punkt", "punkt_tab"):
    nltk.download(_nltk_resource)

# Translation table for str.translate: every ASCII punctuation character and
# digit maps to None, i.e. it is deleted from the text.
trans_table = {ord(c): None for c in string.punctuation + string.digits}
|
| 13 |
+
|
| 14 |
+
def custom_tokenizer_with_Turkish_stemmer(text):
    """Tokenize *text* and return the stem of every token.

    Punctuation and digits are stripped with the module-level ``trans_table``
    before the text is split by ``nltk.word_tokenize``. Each token is then
    lower-cased and passed to the module-level ``stemmerTR``.

    Args:
        text: The raw message as a ``str``.

    Returns:
        A list with one stem per token, in token order.

    NOTE(review): the pickled TF-IDF vectorizer presumably refers to this
    function by name, so the name and signature must stay as they are --
    confirm against the training script.
    """
    cleaned = text.translate(trans_table)
    # str.lower() is not Turkish-locale aware ("I" becomes "i", not "ı").
    # It is kept as-is so tokens are produced exactly as this function has
    # always produced them.
    return [stemmerTR.stem(token.lower()) for token in nltk.word_tokenize(cleaned)]
|
| 21 |
+
|
| 22 |
+
def _load_pickle(path):
    """Unpickle and return the object stored at *path*.

    The file is opened in a ``with`` block so the handle is closed even when
    unpickling raises.
    """
    # NOTE: pickle.load can execute arbitrary code while loading. Only use it
    # on artifacts that ship with the application, never on user uploads.
    with open(path, "rb") as handle:
        return pickle.load(handle)


def predictSMSdata(test_text):
    """Classify a single SMS message.

    Loads the classifier and the TF-IDF vectorizer from their pickle files in
    the current working directory, vectorizes *test_text* and returns the
    predicted category name.

    Args:
        test_text: The raw message text.

    Returns:
        ``"legitimate"`` or ``"spam"``.

    Raises:
        OSError: If either pickle file cannot be opened.
    """
    categories = sorted(["legitimate", "spam"])

    classifier = _load_pickle("LinearSVC_SMS_spam_TR.pickle")
    tfidf_vectorizer = _load_pickle("tfidf_vectorizer_TR.pickle")

    # The vectorizer expects an iterable of documents, so wrap the single text.
    features = tfidf_vectorizer.transform([test_text])
    predicted = classifier.predict(features)

    # NOTE(review): assumes the classifier emits integer class indices that
    # line up with the alphabetically sorted category names -- confirm
    # against the training script.
    return categories[predicted[0]]
|
| 43 |
+
|
| 44 |
+
# Single stemmer instance shared by custom_tokenizer_with_Turkish_stemmer.
stemmerTR = TurkishStemmer()

# Sample message used to pre-fill the text box.
default_value = "Aveadan SUPER bir Muzik Paketi! MAXI yaz, 5555e gonder"
text = st.text_area("enter some text!", value=default_value)

# Classify only when the box is non-empty.
if text:
    st.write(f"The category of SMS = {predictSMSdata(text).upper()}")
|
| 52 |
+
|
requirements.txt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
TurkishStemmer==1.3
|
| 2 |
+
scikit-learn>=1.1
|
| 3 |
+
nltk
|
tfidf_vectorizer_TR.pickle
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fd7ad6fcbd377d3025072502492b36208d32dba87ba4d73bd86171c48b74ba33
|
| 3 |
+
size 82481
|