Spaces: Build error
Upload 2 files
Files changed:
- .gitattributes +1 -0
- combined.ipynb +512 -79
- processed_data.csv +3 -0

.gitattributes
CHANGED
@@ -35,3 +35,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 combined_output.csv filter=lfs diff=lfs merge=lfs -text
 combined_texts.csv filter=lfs diff=lfs merge=lfs -text
+processed_data.csv filter=lfs diff=lfs merge=lfs -text
combined.ipynb
CHANGED
Lines removed by this commit (their replacements appear in the updated and added cells below):

@@ -9,7 +9,7 @@
-  "execution_count":
+  "execution_count": 3,

@@ -35,87 +35,392 @@
-      batch_documents = []
-      combined_texts = [doc['combined'] for doc in cursor if 'combined' in doc]
-          word = line.strip()
-          if word and word not in existing_stop_words:
-              existing_stop_words.add(word)

@@ -123,6 +428,89 @@

@@ -132,17 +520,23 @@
-      X = vectorizer.fit_transform(combined_texts)

@@ -151,7 +545,7 @@
-          keywords = extract_keywords_tfidf(text, stop_words_list,top_n)

@@ -165,62 +559,101 @@
-      return results
-  def calculate_keyword_similarity(text, keywords):
-      # Build a list made up of the text and the keywords
-      similarity_array = []
-      # Build the TF-IDF matrix
-      vectorizer = TfidfVectorizer()
-      tfidf_matrix = vectorizer.fit_transform(documents)
-      # Get the text vector and the keywords vector
-      text_vector = tfidf_matrix[0]
-      keywords_vector = tfidf_matrix[1]
Defining the Turkish stop words

    """"""
    #- here we load the turkish_stop_words
    def load_stop_words(file_path, existing_stop_words='gereksiz_kelimeler.txt'):
        """Reads the stop words from a file and builds a list.
        Takes existing stop words into account if there are any."""

        if existing_stop_words is None:
            existing_stop_words = set()
        else:
            existing_stop_words = set(existing_stop_words)

        with open(file_path, 'r', encoding='utf-8') as file:
            for line in file:
                word = line.strip()
                if word and word not in existing_stop_words:
                    existing_stop_words.add(word)

        return list(existing_stop_words)

    # Load the Turkish stop words file, checking against the existing stop words
    stop_words_list = load_stop_words('gereksiz_kelimeler.txt')

    #----------------------------------------------------------------------------------------------------
Exporting the data in the MongoDB combined_text collection to CSV

    # fetch the combined texts from MongoDB
    import csv
    from pymongo import MongoClient

    def mongo_db_combined_texts_to_csv(database_name='combined_text', collection_name='text', host='localhost', port=27017, batch_size=100, output_file='combined_texts.csv'):
        client = MongoClient(f'mongodb://{host}:{port}/')
        db = client[database_name]
        collection = db[collection_name]

        # get the total number of documents
        total_documents = collection.count_documents({})
        #batch_documents = []

        # Open the CSV file and get it ready for writing
        with open(output_file, mode='w', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            writer.writerow(["combined"])  # CSV header

        # Loop to fetch the documents batch_size at a time
        for i in range(0, total_documents, batch_size):
            cursor = collection.find({}, {"combined": 1, "_id": 0}).skip(i).limit(batch_size)
            combined_texts = [doc['combined'] for doc in cursor if 'combined' in doc]  # pulls the values of the combined field

            # Write the batch to the CSV
            with open(output_file, mode='a', newline='', encoding='utf-8') as file:
                writer = csv.writer(file)
                for text in combined_texts:
                    writer.writerow([text])

        print(f"combined metinler '{output_file}' dosyasına başarıyla yazıldı.")

    # Write the documents to the CSV file
    text = mongo_db_combined_texts_to_csv(batch_size=100)
    #batch_documents.extend((combined_texts, len(combined_texts)))
    #append adds its argument to the list as a single element, i.e. list1 = [1, 2, 3, [4, 5]]
    #whereas extend adds the new list's items one by one, i.e. list1 = [1, 2, 3, 4, 5]
    #return batch_documents

    # Fetch the documents and the document count in batches
    #combined_texts = mongo_db_combined_texts(batch_size=1000)

    # You can process each batch separately
    #print(f"Toplam döküman sayısı:{len(combined_texts)}")

    #for index, text in enumerate(combined_texts[:10]):
        #print(f"Döküman {index + 1}: {text}")

    #print(combined_texts)
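The skip()/limit() pagination above rescans the collection from the beginning for every batch, and the output file is reopened once per batch. A minimal alternative sketch, under the same assumptions about the combined_text database and text collection (stream_combined_texts_to_csv is a hypothetical name), that makes a single cursor pass and keeps the file open:

    import csv
    from pymongo import MongoClient

    def stream_combined_texts_to_csv(database_name='combined_text', collection_name='text',
                                     host='localhost', port=27017,
                                     output_file='combined_texts.csv'):
        collection = MongoClient(f'mongodb://{host}:{port}/')[database_name][collection_name]
        with open(output_file, mode='w', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            writer.writerow(["combined"])  # CSV header
            # Single cursor pass; the driver fetches documents from the server in batches internally.
            for doc in collection.find({"combined": {"$exists": True}}, {"combined": 1, "_id": 0}):
                writer.writerow([doc["combined"]])

Either way, the exported combined_texts.csv is what the later TF-IDF cells read back in.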
    import csv
    from pymongo import MongoClient
    import pandas as pd

    def fetch_from_database(database_name='combined_text', collection_name='text', host='localhost', port=27017, batch_size=100):
        client = MongoClient(f'mongodb://{host}:{port}/')
        db = client[database_name]
        collection = db[collection_name]

        # Get the total number of documents
        total_documents = collection.count_documents({})
        combined_texts = []

        # Loop to fetch the documents batch_size at a time
        for i in range(0, total_documents, batch_size):
            cursor = collection.find({}, {"combined": 1, "_id": 0}).skip(i).limit(batch_size)
            combined_texts.extend([doc['combined'] for doc in cursor if 'combined' in doc])  # pulls the values of the combined field

        return combined_texts

    # Function that truncates the texts
    def truncate_text(text, max_words=300):
        words = text.split()  # Splits the text into words
        return ' '.join(words[:max_words])  # Takes the first max_words words

    # Fetch the data from the database and truncate it
    def fetch_and_truncate_data(database_name, collection_name, host, port, max_words=300):
        # Fetch the data from the database
        combined_texts = fetch_from_database(database_name, collection_name, host, port)

        # Truncate the texts
        truncated_texts = [truncate_text(text, max_words) for text in combined_texts]

        return truncated_texts

    # Save the truncated data to CSV
    def save_to_csv(data, file_path):
        df = pd.DataFrame(data, columns=['combined'])
        df.to_csv(file_path, encoding='utf-8', index=False)

    # The variable definitions
    database_name = 'combined_text'
    collection_name = 'text'
    host = 'localhost'
    port = 27017
    batch_size = 100
    max_words = 300
    output_file = 'processed_data.csv'

    # Fetch the data and process it
    truncated_texts = fetch_and_truncate_data(database_name, collection_name, host, port, max_words)
    save_to_csv(truncated_texts, output_file)
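The processed_data.csv produced by this cell is committed as a Git LFS object of roughly 419 MB (see the pointer file at the end of this commit), so reading it back whole may not be practical. A small sketch, assuming the single combined column written by save_to_csv above, that loads it in chunks:

    import pandas as pd

    # Read the LFS-tracked CSV in manageable pieces instead of all at once.
    for chunk in pd.read_csv('processed_data.csv', chunksize=10_000):
        texts = chunk['combined'].dropna().tolist()
        # ... hand each chunk of texts to the TF-IDF / embedding steps below ...
        print(f"loaded a chunk of {len(texts)} rows")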
Extracting keywords with TF-IDF

    import csv
    from sklearn.feature_extraction.text import TfidfVectorizer
    from joblib import Parallel, delayed
    import pandas as pd

    df = pd.read_csv('combined_texts.csv')
    combined = df['combined'].tolist()

    def extract_keywords_tfidf(combined, stop_words_list, top_n=10):
        """Extracts the keywords with TF-IDF, using the stop words list."""
        vectorizer = TfidfVectorizer(stop_words=stop_words_list)
        X = vectorizer.fit_transform(combined)  # read this from the csv
        feature_names = vectorizer.get_feature_names_out()  # maps each word to its index in the tf-idf vector
        #sorted_keywords = [feature_names[i] for i in X.sum(axis=0).argsort()[0, ::-1]]

        top_keywords_per_document = []  # the best keywords for each document
        top_tfidf_scores_per_document = []  # the highest tf-idf scores for each document

        # Process each document
        for row in X:
            tfidf_scores = row.toarray().flatten()  # flatten into a plain array
            top_indices = tfidf_scores.argsort()[-top_n:][::-1]  # find the highest n scores

            # find the highest-scoring words and their scores
            top_keywords = [feature_names[i] for i in top_indices]
            top_tfidf_scores = [tfidf_scores[i] for i in top_indices]

            top_keywords_per_document.append(top_keywords)
            top_tfidf_scores_per_document.append(top_tfidf_scores)

        return top_keywords_per_document, top_tfidf_scores_per_document

    # Extract the keywords and collect the results
    top_keywords_per_document, top_tfidf_scores_per_document = extract_keywords_tfidf(combined, stop_words_list, top_n=10)

    # Display the results
    for i, (keywords, scores) in enumerate(zip(top_keywords_per_document, top_tfidf_scores_per_document)):
        print(f"Döküman {i+1}:")
        for keyword, score in zip(keywords, scores):
            print(f"{keyword}: {score:.4f}")
        print("\n")

Recorded output: a scikit-learn UserWarning ("Your stop_words may be inconsistent with your preprocessing. Tokenizing the stop words generated tokens ['leh'] not in stop_words.") followed by a KeyboardInterrupt raised inside extract_keywords_tfidf while computing top_indices.
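The interrupted run above densifies every row with toarray() before sorting, which is the expensive part for a large, sparse TF-IDF matrix. A sketch of a leaner variant (top_keywords_sparse is a hypothetical helper) that sorts only the nonzero entries of each CSR row:

    import numpy as np

    def top_keywords_sparse(X, feature_names, top_n=10):
        """Top-n keywords per document using only the nonzero entries of each CSR row."""
        keywords_per_doc, scores_per_doc = [], []
        for row in X:                                     # X is the CSR matrix from fit_transform
            order = np.argsort(row.data)[::-1][:top_n]    # sort only the nonzero scores
            cols = row.indices[order]                     # column indices of the top scores
            keywords_per_doc.append([feature_names[i] for i in cols])
            scores_per_doc.append(row.data[order].tolist())
        return keywords_per_doc, scores_per_doc

The next cell in the commit takes a different route and parallelizes the per-row work with joblib instead.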
    import re
    import pandas as pd
    from sklearn.feature_extraction.text import TfidfVectorizer
    from joblib import Parallel, delayed

    # Read the CSV file
    df = pd.read_csv('combined_texts.csv')
    combined = df['combined'].tolist()

    def pad_or_truncate(texts, max_length):
        """Truncates or pads the texts to a fixed length."""
        padded_texts = []
        for text in texts:
            words = text.split()
            if len(words) > max_length:
                padded_texts.append(' '.join(words[:max_length]))
            else:
                padded_texts.append(' '.join(words + [''] * (max_length - len(words))))
        return padded_texts

    # Padding length
    max_length = 300  # e.g. cap the text length at 300 words

    # Pad or truncate the texts
    combined_padded = pad_or_truncate(combined, max_length)

    def parse_text(text):
        """Parses the given text and turns it into a regular structure."""
        # Split into rows
        lines = text.split('|-')

        data = []
        for line in lines:
            line = line.strip()
            if not line or line.startswith("align"):
                continue

            # Split the row into parts
            parts = re.split(r'\s*\|\s*', line)  # how does this split the rows into parts??

            # Clean the values and build the structure
            if len(parts) >= 2:  # what are season and team, and why were they defined as variables?
                season = parts[0].strip()
                team = parts[1].strip()
                stats = [p.strip() for p in parts[2:] if p.strip()]
                data.append([season, team] + stats)

        return data

    def clean_data(file_path):
        """Reads the CSV file and tidies the data."""
        with open(file_path, 'r') as file:
            raw_text = file.read()

        data = parse_text(raw_text)

        # Build the data frame
        df = pd.DataFrame(data, columns=['kaynakça'])

        return df

    # Clean the CSV file and build a tidy DataFrame
    cleaned_df = clean_data('combined_texts.csv')

    # Inspect the tidied data
    print(cleaned_df.head())

    def extract_keywords_tfidf(combined, stop_words_list, top_n=10, n_jobs=-1):
        """Extracts keywords with TF-IDF, uses the stop words list, and processes rows in parallel."""
        vectorizer = TfidfVectorizer(stop_words=stop_words_list)

        # Build the TF-IDF matrix (n_jobs is used to spread the per-row work over the CPUs)
        X = vectorizer.fit_transform(combined)  # read this from the csv
        feature_names = vectorizer.get_feature_names_out()  # maps each word to its index in the tf-idf vector

        # Find the best keywords and tf-idf scores for each document in parallel
        def process_row(row):
            tfidf_scores = row.toarray().flatten()  # flatten into a plain array
            top_indices = tfidf_scores.argsort()[-top_n:][::-1]  # find the highest n scores

            # Find the highest-scoring words and their scores
            top_keywords = [feature_names[i] for i in top_indices]
            top_tfidf_scores = [tfidf_scores[i] for i in top_indices]
            return top_keywords, top_tfidf_scores

        results = Parallel(n_jobs=n_jobs)(delayed(process_row)(row) for row in X)

        # Split the results into separate lists
        top_keywords_per_document, top_tfidf_scores_per_document = zip(*results)

        return top_keywords_per_document, top_tfidf_scores_per_document

    # Extract the keywords and collect the results
    top_keywords_per_document, top_tfidf_scores_per_document = extract_keywords_tfidf(combined, stop_words_list, top_n=10, n_jobs=-1)

    # Display the results
    for i, (keywords, scores) in enumerate(zip(top_keywords_per_document, top_tfidf_scores_per_document)):
        print(f"Döküman {i+1}:")
        for keyword, score in zip(keywords, scores):
            print(f"{keyword}: {score:.4f}")
        print("\n")

Recorded output: the same stop_words UserWarning, then a KeyboardInterrupt raised while the joblib Parallel call was still retrieving results from its workers.
    from sentence_transformers import SentenceTransformer

    model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')

    # Top_keywords embedding
    keyword_embeddings = model.encode(top_keywords_per_document)
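SentenceTransformer.encode expects strings, while top_keywords_per_document holds one list of keywords per document, so the call above may not produce one embedding per document as intended. A sketch of one way to do that, assuming the variables defined in the cell above, is to join each document's keywords into a single string first:

    # One string of keywords per document, then one embedding per document.
    keyword_sentences = [' '.join(keywords) for keywords in top_keywords_per_document]
    keyword_embeddings = model.encode(keyword_sentences)  # shape: (n_documents, embedding_dim)

The following cell scores keywords against a text directly with TF-IDF and cosine similarity.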
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity

    def calculate_keyword_similarity(text, keywords):
        # Build the TF-IDF matrix
        tfidf_vectorizer = TfidfVectorizer()

        # turn the text and the keywords into tf-idf vectors
        text_tfidf = tfidf_vectorizer.fit_transform([text])  # this should come from the combined column

        # similarity computation
        similarity_array = []
        for keyword in keywords:
            # Turn each keyword into a TF-IDF vector
            keyword_tfidf = tfidf_vectorizer.transform([keyword])  # takes the keywords one at a time

            # Compute the similarity with cosine similarity
            similarity = cosine_similarity(text_tfidf, keyword_tfidf)[0][0]

            # Record the keyword and its similarity score
            similarity_array.append((keyword, similarity))

        return similarity_array

    # Example text and keywords
    # combined data
    text = "Bu bir örnek metindir ve bu metin üzerinde anahtar kelimeleri test ediyoruz."
    keywords = ["başka", "bir", "anahtar", "kelimeleri", "test"]  # these keywords should be taken from the tf-idf step and turned into arrays

    # Compute the relevance scores
    similarity_results = calculate_keyword_similarity(text, keywords)
    top_5_keywords = sorted(similarity_results, key=lambda x: x[1], reverse=True)[:5]

    # Print each keyword's relevance score
    for keyword, similarity in top_5_keywords:
        print(f"Keyword: {keyword}, Similarity: {similarity}")
        #print(f"Keyword: '{keyword}' - Relevance score: {score:.4f}")

Recorded output:
    Keyword: bir, Similarity: 0.26726124191242445
    Keyword: anahtar, Similarity: 0.26726124191242445
    Keyword: kelimeleri, Similarity: 0.26726124191242445
    Keyword: test, Similarity: 0.26726124191242445
    Keyword: başka, Similarity: 0.0
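Because the vectorizer here is fitted on the single example text, any keyword that never occurs in that text (such as "başka" above) becomes an all-zero vector and scores 0.0. The cell removed by this commit apparently fitted the vectorizer on the text and the keywords together; a hedged sketch of that variant (keyword_set_similarity is a hypothetical name):

    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity

    def keyword_set_similarity(text, keywords):
        # Fit on both the document and the joined keywords so they share one vocabulary.
        documents = [text, ' '.join(keywords)]
        tfidf_matrix = TfidfVectorizer().fit_transform(documents)
        text_vector, keywords_vector = tfidf_matrix[0], tfidf_matrix[1]
        # One score for how well the whole keyword set matches the text.
        return cosine_similarity(text_vector, keywords_vector)[0][0]

The remaining hunks touch the existing BERT keyword/embedding cell and the closing export and heading cells, shown next.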
    # Load the BERT Tokenizer and Model
    ...
    def extract_keywords_tfidf(combined_texts, stop_words_list, top_n=5):
        """Extracts the keywords with TF-IDF, using the stop words list."""
        vectorizer = TfidfVectorizer(stop_words=stop_words_list)
        X = vectorizer.fit_transform(combined_texts)  # read this from the csv
        feature_names = vectorizer.get_feature_names_out()
        #sorted_keywords = [feature_names[i] for i in X.sum(axis=0).argsort()[0, ::-1]]

        top_keywords_per_document = []  # the best keywords for each document
        top_tfidf_scores_per_document = []  # the highest tf-idf scores for each document
        #------------------------------------------------------------------------------------------
        for row in X:
            tfidf_scores = row.toarray().flatten()
            top_indices = tfidf_scores.argsort()[-top_n:][::-1]  # find the highest n scores
            top_keywords = [feature_names[i] for i in top_indices]
            top_tfidf_scores = [tfidf_scores[i] for i in top_indices]

            top_tfidf_scores_per_document.append(top_tfidf_scores)
            top_keywords_per_document.append(top_keywords)

        return top_keywords_per_document

    # Extract the keywords and build embeddings with BERT
    ...
        for text in combined_texts:
            # Extract the keywords
            keywords = extract_keywords_tfidf(text, stop_words_list, top_n=15)

            # Build the embedding with BERT
            inputs = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)
    ...
                'embedding': embeddings
            })

        return results

    results = process_texts
    results
    #process all the texts

Recorded output: <function __main__.process_texts(combined_texts, stop_words_list, top_n)>

Fetching the database from MongoDB

    # fetch the combined texts from MongoDB
    import csv
    from pymongo import MongoClient

    def mongo_db_combined_texts_to_csv(database_name='combined_text', collection_name='text', host='localhost', port=27017, batch_size=1000, output_file='combined_texts.csv'):
        client = MongoClient(f'mongodb://{host}:{port}/')
        db = client[database_name]
        collection = db[collection_name]

        # get the total number of documents
        total_documents = collection.count_documents({})
        #batch_documents = []

        # Open the CSV file and get it ready for writing
        with open(output_file, mode='w', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            writer.writerow(["combined"])  # CSV header

        # Loop to fetch the documents batch_size at a time
        for i in range(0, total_documents, batch_size):
            cursor = collection.find({}, {"combined": 1, "_id": 0}).skip(i).limit(batch_size)
            combined_texts = [doc['combined'] for doc in cursor if 'combined' in doc]  # pulls the values of the combined field

            # Write the batch to the CSV
            with open(output_file, mode='a', newline='', encoding='utf-8') as file:
                writer = csv.writer(file)
                for text in combined_texts:
                    writer.writerow([text])

        print(f"combined metinler '{output_file}' dosyasına başarıyla yazıldı.")

    # Write the documents to the CSV file
    text = mongo_db_combined_texts_to_csv(batch_size=1000)
    #batch_documents.extend((combined_texts, len(combined_texts)))
    #append adds its argument to the list as a single element, i.e. list1 = [1, 2, 3, [4, 5]]
    #whereas extend adds the new list's items one by one, i.e. list1 = [1, 2, 3, 4, 5]
    #return batch_documents

    # Fetch the documents and the document count in batches
    #combined_texts = mongo_db_combined_texts(batch_size=1000)

    # You can process each batch separately
    #print(f"Toplam döküman sayısı:{len(combined_texts)}")

    #for index, text in enumerate(combined_texts[:10]):
        #print(f"Döküman {index + 1}: {text}")

    #print(combined_texts)

Recorded output: combined metinler 'combined_texts.csv' dosyasına başarıyla yazıldı.

Importing the unnecessary words from 'gereksiz_kelimeler.txt'

TF-IDF scores
processed_data.csv
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:97993e92396f03aa90162dad808bbd3c655a988b37d7ba45704b0058371c6172
+size 419458630