Spaces:

yonkasoft
/

makaleChatbotu

Build error

App Files Files Community

yonkasoft commited on Aug 26, 2024

Commit

cba6b49

verified ·

1 Parent(s): fa8e9f4

Upload 2 files

Browse files

Files changed (3) hide show

.gitattributes +1 -0
cleaned_processed_data.csv +3 -0
combined.ipynb +378 -140

.gitattributes CHANGED Viewed

@@ -36,3 +36,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 combined_output.csv filter=lfs diff=lfs merge=lfs -text
 combined_texts.csv filter=lfs diff=lfs merge=lfs -text
 processed_data.csv filter=lfs diff=lfs merge=lfs -text

 combined_output.csv filter=lfs diff=lfs merge=lfs -text
 combined_texts.csv filter=lfs diff=lfs merge=lfs -text
 processed_data.csv filter=lfs diff=lfs merge=lfs -text
+cleaned_processed_data.csv filter=lfs diff=lfs merge=lfs -text

cleaned_processed_data.csv ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0827f1e2337aaf3e75ccbed508fd13dcb2d26612a1a55a8922d25e77fc54dd85
+size 391939173

combined.ipynb CHANGED Viewed

@@ -9,7 +9,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -40,7 +40,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -138,15 +138,15 @@
    ]
   },
   {
-   "cell_type": "code",
-   "execution_count": null,
    "metadata": {},
-   "outputs": [],
-   "source": []
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -204,13 +204,6 @@
     "save_to_csv(truncated_texts, output_file)\n"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -220,79 +213,37 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
    "metadata": {},
    "outputs": [
     {
-     "name": "stderr",
      "output_type": "stream",
      "text": [
-      "c:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\sklearn\\feature_extraction\\text.py:406: UserWarning: Your stop_words may be inconsistent with your preprocessing. Tokenizing the stop words generated tokens ['leh'] not in stop_words.\n",
-      "  warnings.warn(\n"
      ]
     },
-    {
-     "ename": "KeyboardInterrupt",
-     "evalue": "",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[1;31mKeyboardInterrupt\u001b[0m                         Traceback (most recent call last)",
-      "Cell \u001b[1;32mIn[11], line 33\u001b[0m\n\u001b[0;32m     30\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m top_keywords_per_document, top_tfidf_scores_per_document\n\u001b[0;32m     32\u001b[0m \u001b[38;5;66;03m# Anahtar kelimeleri çıkar ve sonuçları al\u001b[39;00m\n\u001b[1;32m---> 33\u001b[0m top_keywords_per_document, top_tfidf_scores_per_document \u001b[38;5;241m=\u001b[39m \u001b[43mextract_keywords_tfidf\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcombined\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstop_words_list\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtop_n\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m10\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[0;32m     36\u001b[0m \u001b[38;5;66;03m# Sonuçları görüntüleme\u001b[39;00m\n\u001b[0;32m     37\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m i, (keywords, scores) \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28menumerate\u001b[39m(\u001b[38;5;28mzip\u001b[39m(top_keywords_per_document, top_tfidf_scores_per_document)):\n",
-      "Cell \u001b[1;32mIn[11], line 21\u001b[0m, in \u001b[0;36mextract_keywords_tfidf\u001b[1;34m(combined, stop_words_list, top_n)\u001b[0m\n\u001b[0;32m     19\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m row \u001b[38;5;129;01min\u001b[39;00m X:\n\u001b[0;32m     20\u001b[0m     tfidf_scores \u001b[38;5;241m=\u001b[39m row\u001b[38;5;241m.\u001b[39mtoarray()\u001b[38;5;241m.\u001b[39mflatten() \u001b[38;5;66;03m#değişkenleri düz bir değişken haline getirme\u001b[39;00m\n\u001b[1;32m---> 21\u001b[0m     top_indices \u001b[38;5;241m=\u001b[39m \u001b[43mtfidf_scores\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43margsort\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m[\u001b[38;5;241m-\u001b[39mtop_n:][::\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m]  \u001b[38;5;66;03m# En yüksek n skoru bul\u001b[39;00m\n\u001b[0;32m     23\u001b[0m     \u001b[38;5;66;03m#en yüksek skorlu kelimleri ve skorları bul\u001b[39;00m\n\u001b[0;32m     24\u001b[0m     top_keywords \u001b[38;5;241m=\u001b[39m [feature_names[i] \u001b[38;5;28;01mfor\u001b[39;00m i \u001b[38;5;129;01min\u001b[39;00m top_indices]\n",
-      "\u001b[1;31mKeyboardInterrupt\u001b[0m: "
-     ]
-    }
-   ],
-   "source": [
-    "import csv\n",
-    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
-    "from joblib import Parallel, delayed\n",
-    "import pandas as pd\n",
-    "\n",
-    "df=pd.read_csv('combined_texts.csv')\n",
-    "combined= df['combined'].tolist()\n",
-    "def extract_keywords_tfidf(combined, stop_words_list,top_n=10):\n",
-    "    \"\"\"TF-IDF ile anahtar kelimeleri çıkarır, stop words listesi ile birlikte kullanır.\"\"\"\n",
-    "    vectorizer = TfidfVectorizer(stop_words=stop_words_list)\n",
-    "    X = vectorizer.fit_transform(combined) #bunu csv den oku \n",
-    "    feature_names = vectorizer.get_feature_names_out() #her kelimenin tf-ıdf vektöründeki karşılığını tutar \n",
-    "    #sorted_keywords = [feature_names[i] for i in X.sum(axis=0).argsort()[0, ::-1]]\n",
-    "    \n",
-    "    top_keywords_per_document = [] #her döküman için en iyi keywordsleri alır\n",
-    "    top_tfidf_scores_per_document = [] #tf-ıdf değeri en yüksek olan dökümanlar\n",
-    "\n",
-    "    # Her dökümanı işleme\n",
-    "    for row in X:\n",
-    "        tfidf_scores = row.toarray().flatten() #değişkenleri düz bir değişken haline getirme\n",
-    "        top_indices = tfidf_scores.argsort()[-top_n:][::-1]  # En yüksek n skoru bul\n",
-    "        \n",
-    "        #en yüksek skorlu kelimleri ve skorları bul\n",
-    "        top_keywords = [feature_names[i] for i in top_indices]\n",
-    "        top_tfidf_scores = [tfidf_scores[i] for i in top_indices]\n",
-    "        \n",
-    "        top_keywords_per_document.append(top_keywords)\n",
-    "        top_tfidf_scores_per_document.append(top_tfidf_scores)\n",
-    "        \n",
-    "    return top_keywords_per_document, top_tfidf_scores_per_document\n",
-    "\n",
-    "# Anahtar kelimeleri çıkar ve sonuçları al\n",
-    "top_keywords_per_document, top_tfidf_scores_per_document = extract_keywords_tfidf(combined, stop_words_list, top_n=10)\n",
-    " \n",
-    "\n",
-    "# Sonuçları görüntüleme\n",
-    "for i, (keywords, scores) in enumerate(zip(top_keywords_per_document, top_tfidf_scores_per_document)):\n",
-    "    print(f\"Döküman {i+1}:\")\n",
-    "    for keyword, score in zip(keywords, scores):\n",
-    "        print(f\"{keyword}: {score:.4f}\")\n",
-    "    print(\"\\n\")\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "metadata": {},
-   "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
@@ -300,21 +251,6 @@
       "c:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\sklearn\\feature_extraction\\text.py:406: UserWarning: Your stop_words may be inconsistent with your preprocessing. Tokenizing the stop words generated tokens ['leh'] not in stop_words.\n",
       "  warnings.warn(\n"
      ]
-    },
-    {
-     "ename": "KeyboardInterrupt",
-     "evalue": "",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[1;31mKeyboardInterrupt\u001b[0m                         Traceback (most recent call last)",
-      "Cell \u001b[1;32mIn[5], line 53\u001b[0m\n\u001b[0;32m     50\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m top_keywords_per_document, top_tfidf_scores_per_document\n\u001b[0;32m     52\u001b[0m \u001b[38;5;66;03m# Anahtar kelimeleri çıkar ve sonuçları al\u001b[39;00m\n\u001b[1;32m---> 53\u001b[0m top_keywords_per_document, top_tfidf_scores_per_document \u001b[38;5;241m=\u001b[39m \u001b[43mextract_keywords_tfidf\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcombined\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstop_words_list\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtop_n\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m10\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mn_jobs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m-\u001b[39;49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[0;32m     55\u001b[0m \u001b[38;5;66;03m# Sonuçları görüntüleme\u001b[39;00m\n\u001b[0;32m     56\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m i, (keywords, scores) \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28menumerate\u001b[39m(\u001b[38;5;28mzip\u001b[39m(top_keywords_per_document, top_tfidf_scores_per_document)):\n",
-      "Cell \u001b[1;32mIn[5], line 45\u001b[0m, in \u001b[0;36mextract_keywords_tfidf\u001b[1;34m(combined, stop_words_list, top_n, n_jobs)\u001b[0m\n\u001b[0;32m     42\u001b[0m     top_tfidf_scores \u001b[38;5;241m=\u001b[39m [tfidf_scores[i] \u001b[38;5;28;01mfor\u001b[39;00m i \u001b[38;5;129;01min\u001b[39;00m top_indices]\n\u001b[0;32m     43\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m top_keywords, top_tfidf_scores\n\u001b[1;32m---> 45\u001b[0m results \u001b[38;5;241m=\u001b[39m \u001b[43mParallel\u001b[49m\u001b[43m(\u001b[49m\u001b[43mn_jobs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mn_jobs\u001b[49m\u001b[43m)\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdelayed\u001b[49m\u001b[43m(\u001b[49m\u001b[43mprocess_row\u001b[49m\u001b[43m)\u001b[49m\u001b[43m(\u001b[49m\u001b[43mrow\u001b[49m\u001b[43m)\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mrow\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mX\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m     47\u001b[0m \u001b[38;5;66;03m# Sonuçları listelere ayırma\u001b[39;00m\n\u001b[0;32m     48\u001b[0m top_keywords_per_document, top_tfidf_scores_per_document \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mzip\u001b[39m(\u001b[38;5;241m*\u001b[39mresults)\n",
-      "File \u001b[1;32mc:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\joblib\\parallel.py:2007\u001b[0m, in \u001b[0;36mParallel.__call__\u001b[1;34m(self, iterable)\u001b[0m\n\u001b[0;32m   2001\u001b[0m \u001b[38;5;66;03m# The first item from the output is blank, but it makes the interpreter\u001b[39;00m\n\u001b[0;32m   2002\u001b[0m \u001b[38;5;66;03m# progress until it enters the Try/Except block of the generator and\u001b[39;00m\n\u001b[0;32m   2003\u001b[0m \u001b[38;5;66;03m# reaches the first `yield` statement. This starts the asynchronous\u001b[39;00m\n\u001b[0;32m   2004\u001b[0m \u001b[38;5;66;03m# dispatch of the tasks to the workers.\u001b[39;00m\n\u001b[0;32m   2005\u001b[0m \u001b[38;5;28mnext\u001b[39m(output)\n\u001b[1;32m-> 2007\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m output \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mreturn_generator \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;43mlist\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43moutput\u001b[49m\u001b[43m)\u001b[49m\n",
-      "File \u001b[1;32mc:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\joblib\\parallel.py:1650\u001b[0m, in \u001b[0;36mParallel._get_outputs\u001b[1;34m(self, iterator, pre_dispatch)\u001b[0m\n\u001b[0;32m   1647\u001b[0m     \u001b[38;5;28;01myield\u001b[39;00m\n\u001b[0;32m   1649\u001b[0m     \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backend\u001b[38;5;241m.\u001b[39mretrieval_context():\n\u001b[1;32m-> 1650\u001b[0m         \u001b[38;5;28;01myield from\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_retrieve()\n\u001b[0;32m   1652\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mGeneratorExit\u001b[39;00m:\n\u001b[0;32m   1653\u001b[0m     \u001b[38;5;66;03m# The generator has been garbage collected before being fully\u001b[39;00m\n\u001b[0;32m   1654\u001b[0m     \u001b[38;5;66;03m# consumed. This aborts the remaining tasks if possible and warn\u001b[39;00m\n\u001b[0;32m   1655\u001b[0m     \u001b[38;5;66;03m# the user if necessary.\u001b[39;00m\n\u001b[0;32m   1656\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_exception \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n",
-      "File \u001b[1;32mc:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\joblib\\parallel.py:1762\u001b[0m, in \u001b[0;36mParallel._retrieve\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m   1757\u001b[0m \u001b[38;5;66;03m# If the next job is not ready for retrieval yet, we just wait for\u001b[39;00m\n\u001b[0;32m   1758\u001b[0m \u001b[38;5;66;03m# async callbacks to progress.\u001b[39;00m\n\u001b[0;32m   1759\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m ((\u001b[38;5;28mlen\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_jobs) \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m0\u001b[39m) \u001b[38;5;129;01mor\u001b[39;00m\n\u001b[0;32m   1760\u001b[0m     (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_jobs[\u001b[38;5;241m0\u001b[39m]\u001b[38;5;241m.\u001b[39mget_status(\n\u001b[0;32m   1761\u001b[0m         timeout\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtimeout) \u001b[38;5;241m==\u001b[39m TASK_PENDING)):\n\u001b[1;32m-> 1762\u001b[0m     \u001b[43mtime\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msleep\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m0.01\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[0;32m   1763\u001b[0m     \u001b[38;5;28;01mcontinue\u001b[39;00m\n\u001b[0;32m   1765\u001b[0m \u001b[38;5;66;03m# We need to be careful: the job list can be filling up as\u001b[39;00m\n\u001b[0;32m   1766\u001b[0m \u001b[38;5;66;03m# we empty it and Python list are not thread-safe by\u001b[39;00m\n\u001b[0;32m   1767\u001b[0m \u001b[38;5;66;03m# default hence the use of the lock\u001b[39;00m\n",
-      "\u001b[1;31mKeyboardInterrupt\u001b[0m: "
-     ]
     }
    ],
    "source": [
@@ -322,10 +258,13 @@
     "import pandas as pd\n",
     "from sklearn.feature_extraction.text import TfidfVectorizer\n",
     "from joblib import Parallel, delayed\n",
     "\n",
     "\n",
     "# CSV dosyasını okuma\n",
-    "df = pd.read_csv('combined_texts.csv')\n",
     "combined = df['combined'].tolist()\n",
     "\n",
     "\n",
@@ -371,18 +310,18 @@
     "\n",
     "def clean_data(file_path):\n",
     "    \"\"\"CSV dosyasını okur ve veriyi düzenler.\"\"\"\n",
-    "    with open(file_path, 'r') as file:\n",
     "        raw_text = file.read()\n",
     "    \n",
     "    data = parse_text(raw_text)\n",
     "    \n",
     "    # Veri çerçevesi oluştur\n",
-    "    df = pd.DataFrame(data, columns=['kaynakça'])\n",
     "    \n",
     "    return df\n",
     "\n",
     "# CSV dosyasını temizleyip düzenli bir DataFrame oluştur\n",
-    "cleaned_df = clean_data('combined_texts.csv')\n",
     "\n",
     "# Düzenlenmiş veriyi kontrol et\n",
     "print(cleaned_df.head())\n",
@@ -405,7 +344,7 @@
     "        top_tfidf_scores = [tfidf_scores[i] for i in top_indices]\n",
     "        return top_keywords, top_tfidf_scores\n",
     "\n",
-    "    results = Parallel(n_jobs=n_jobs)(delayed(process_row)(row) for row in X)\n",
     "\n",
     "    # Sonuçları listelere ayırma\n",
     "    top_keywords_per_document, top_tfidf_scores_per_document = zip(*results)\n",
@@ -413,14 +352,81 @@
     "    return top_keywords_per_document, top_tfidf_scores_per_document\n",
     "\n",
     "# Anahtar kelimeleri çıkar ve sonuçları al\n",
-    "top_keywords_per_document, top_tfidf_scores_per_document = extract_keywords_tfidf(combined, stop_words_list, top_n=10, n_jobs=-1)\n",
     "\n",
     "# Sonuçları görüntüleme\n",
     "for i, (keywords, scores) in enumerate(zip(top_keywords_per_document, top_tfidf_scores_per_document)):\n",
     "    print(f\"Döküman {i+1}:\")\n",
     "    for keyword, score in zip(keywords, scores):\n",
     "        print(f\"{keyword}: {score:.4f}\")\n",
-    "    print(\"\\n\")\n"
    ]
   },
   {
@@ -437,23 +443,18 @@
     "keyword_embeddings = model.encode(top_keywords_per_document)\n"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Keyword: bir, Similarity: 0.26726124191242445\n",
-      "Keyword: anahtar, Similarity: 0.26726124191242445\n",
-      "Keyword: kelimeleri, Similarity: 0.26726124191242445\n",
-      "Keyword: test, Similarity: 0.26726124191242445\n",
-      "Keyword: başka, Similarity: 0.0\n"
-     ]
-    }
-   ],
    "source": [
     "from sklearn.feature_extraction.text import TfidfVectorizer\n",
     "from sklearn.metrics.pairwise import cosine_similarity\n",
@@ -481,6 +482,8 @@
     "\n",
     "# Örnek metin ve anahtar kelimeler\n",
     "#combined verileri \n",
     "text = \"Bu bir örnek metindir ve bu metin üzerinde anahtar kelimeleri test ediyoruz.\"\n",
     "keywords = [\"başka\", \"bir\", \"anahtar\", \"kelimeleri\", \"test\"] #bu keywordsler tf-değerinden alınarak arraylere çevrilmeli \n",
     "    \n",
@@ -497,20 +500,79 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "<function __main__.process_texts(combined_texts, stop_words_list, top_n)>"
-      ]
-     },
-     "execution_count": 10,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
    "source": [
     "\n",
     "# BERT Tokenizer ve Model'i yükleyin\n",
@@ -575,17 +637,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "combined metinler 'combined_texts.csv' dosyasına başarıyla yazıld��.\n"
-     ]
-    }
-   ],
    "source": [
     "#mongodb üzerinden combined_textleri çek\n",
     "import csv\n",
@@ -824,13 +878,197 @@
     "    print(f\"Keyword: {keyword}, Similarity: {similarity}\")"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
-    " "
    ]
   }
  ],

   },
   {
    "cell_type": "code",
+   "execution_count": 1,
    "metadata": {},
    "outputs": [],
    "source": [
   },
   {
    "cell_type": "code",
+   "execution_count": 2,
    "metadata": {},
    "outputs": [],
    "source": [
    ]
   },
   {
+   "cell_type": "markdown",
    "metadata": {},
+   "source": [
+    "Metinleri Kısaltma Fonksiyonu (processed_data kaydetme)"
+   ]
   },
   {
    "cell_type": "code",
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
     "save_to_csv(truncated_texts, output_file)\n"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
   },
   {
    "cell_type": "code",
+   "execution_count": 9,
    "metadata": {},
    "outputs": [
     {
+     "name": "stdout",
      "output_type": "stream",
      "text": [
+      "  0     1              2                3              4                5    \\\n",
+      "0      1992        Hitachi  Football League              6                0   \n",
+      "1                        6                0           None             None   \n",
+      "2      1993  rowspan=\"\"3\"\"   Kashiwa Reysol  rowspan=\"\"2\"\"  Football League   \n",
+      "3      1994              0                0              0                0   \n",
+      "4      1995      J1 League               17              1                2   \n",
+      "\n",
+      "             6              7     8     9    ...   204   205   206   207  \\\n",
+      "0  colspan=\"\"2\"\"           None  None  None  ...  None  None  None  None   \n",
+      "1           None           None  None  None  ...  None  None  None  None   \n",
+      "2             12              5     1     0  ...  None  None  None  None   \n",
+      "3              0              0     0     0  ...  None  None  None  None   \n",
+      "4              0  colspan=\"\"2\"\"  None  None  ...  None  None  None  None   \n",
+      "\n",
+      "    208   209   210   211   212   213  \n",
+      "0  None  None  None  None  None  None  \n",
+      "1  None  None  None  None  None  None  \n",
+      "2  None  None  None  None  None  None  \n",
+      "3  None  None  None  None  None  None  \n",
+      "4  None  None  None  None  None  None  \n",
+      "\n",
+      "[5 rows x 214 columns]\n"
      ]
     },
     {
      "name": "stderr",
      "output_type": "stream",
       "c:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\sklearn\\feature_extraction\\text.py:406: UserWarning: Your stop_words may be inconsistent with your preprocessing. Tokenizing the stop words generated tokens ['leh'] not in stop_words.\n",
       "  warnings.warn(\n"
      ]
     }
    ],
    "source": [
     "import pandas as pd\n",
     "from sklearn.feature_extraction.text import TfidfVectorizer\n",
     "from joblib import Parallel, delayed\n",
+    "from tqdm import tqdm\n",
+    "import csv\n",
+    "\n",
     "\n",
     "\n",
     "# CSV dosyasını okuma\n",
+    "df = pd.read_csv('processed_data.csv')\n",
     "combined = df['combined'].tolist()\n",
     "\n",
     "\n",
     "\n",
     "def clean_data(file_path):\n",
     "    \"\"\"CSV dosyasını okur ve veriyi düzenler.\"\"\"\n",
+    "    with open(file_path, 'r',encoding='utf-8') as file:\n",
     "        raw_text = file.read()\n",
     "    \n",
     "    data = parse_text(raw_text)\n",
     "    \n",
     "    # Veri çerçevesi oluştur\n",
+    "    df = pd.DataFrame(data)\n",
     "    \n",
     "    return df\n",
     "\n",
     "# CSV dosyasını temizleyip düzenli bir DataFrame oluştur\n",
+    "cleaned_df = clean_data('processed_data.csv')\n",
     "\n",
     "# Düzenlenmiş veriyi kontrol et\n",
     "print(cleaned_df.head())\n",
     "        top_tfidf_scores = [tfidf_scores[i] for i in top_indices]\n",
     "        return top_keywords, top_tfidf_scores\n",
     "\n",
+    "    results = Parallel(n_jobs=n_jobs)(delayed(process_row)(row) for row in tqdm(X))\n",
     "\n",
     "    # Sonuçları listelere ayırma\n",
     "    top_keywords_per_document, top_tfidf_scores_per_document = zip(*results)\n",
     "    return top_keywords_per_document, top_tfidf_scores_per_document\n",
     "\n",
     "# Anahtar kelimeleri çıkar ve sonuçları al\n",
+    "# İlk 400.000 dökümanı işleyin\n",
+    "combined_sample = combined[:400000]\n",
+    "top_keywords_per_document, top_tfidf_scores_per_document = extract_keywords_tfidf(combined_sample, stop_words_list, top_n=10, n_jobs=-1)\n",
+    "#n_jobs'ın 2 olması, aynı anda iki işlemin paralel olarak yürütülmesi demektir.\n",
+    "#n_jobs'ın -1 olması, mevcut tüm işlemci çekirdeklerinin kullanılması demektir.\n",
+    "\n",
+    "#Sonuçları CSV dosyasına kaydetme\n",
+    "with open('keywords_with_scores.csv', mode='w', newline='', encoding='utf-8') as file:\n",
+    "    writer = csv.writer(file)\n",
+    "    # Başlık satırını yazma\n",
+    "    writer.writerow(['Document_Index'] + [f'Keyword_{i+1}' for i in range(10)] + [f'Score_{i+1}' for i in range(10)])\n",
+    "    \n",
+    "    # Her döküman için anahtar kelimeler ve skorları yazma\n",
+    "    for i, (keywords, scores) in enumerate(zip(top_keywords_per_document, top_tfidf_scores_per_document)):\n",
+    "        row = [i+1] + keywords + [f\"{score:.4f}\" for score in scores]\n",
+    "        writer.writerow(row)\n",
     "\n",
+    "print(\"Sonuçlar 'keywords_with_scores.csv' dosyasına kaydedildi.\")\n",
+    "\"\"\"\n",
     "# Sonuçları görüntüleme\n",
     "for i, (keywords, scores) in enumerate(zip(top_keywords_per_document, top_tfidf_scores_per_document)):\n",
     "    print(f\"Döküman {i+1}:\")\n",
     "    for keyword, score in zip(keywords, scores):\n",
     "        print(f\"{keyword}: {score:.4f}\")\n",
+    "    print(\"\\n\")\n",
+    "\"\"\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Buradaki keywords ve skorlar yukarıda çekildi."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import csv\n",
+    "\n",
+    "# Extract the top keywords and their TF-IDF scores for every document\n",
+    "top_keywords_per_document, top_tfidf_scores_per_document = extract_keywords_tfidf(combined, stop_words_list, top_n=10, n_jobs=-1)\n",
+    "\n",
+    "# Build one record per document: Keyword_1, Score_1, Keyword_2, Score_2, ...\n",
+    "results_top = []\n",
+    "for keywords, scores in zip(top_keywords_per_document, top_tfidf_scores_per_document):\n",
+    "    row = {}\n",
+    "    for i, (keyword, score) in enumerate(zip(keywords, scores)):\n",
+    "        row[f'Keyword_{i+1}'] = keyword\n",
+    "        row[f'Score_{i+1}'] = score\n",
+    "    results_top.append(row)\n",
+    "\n",
+    "# Convert the records to a DataFrame\n",
+    "df = pd.DataFrame(results_top)\n",
+    "\n",
+    "# Write the CSV exactly once: a full write followed by appending the same\n",
+    "# rows chunk by chunk would store every row (and the header) twice.\n",
+    "# `chunksize` makes pandas write the rows in batches of that many rows.\n",
+    "chunksize = 1000\n",
+    "df.to_csv('keywords_with_scores.csv', index=False, encoding='utf-8', chunksize=chunksize)\n",
+    "\n",
+    "# Preview the result\n",
+    "print(df.head())\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Encoding yapmak için"
    ]
   },
   {
     "keyword_embeddings = model.encode(top_keywords_per_document)\n"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Text ve keywords similarity denemesi"
+   ]
+  },
   {
    "cell_type": "code",
+   "execution_count": null,
    "metadata": {},
+   "outputs": [],
    "source": [
     "from sklearn.feature_extraction.text import TfidfVectorizer\n",
     "from sklearn.metrics.pairwise import cosine_similarity\n",
     "\n",
     "# Örnek metin ve anahtar kelimeler\n",
     "#combined verileri \n",
+    "\n",
+    "\n",
     "text = \"Bu bir örnek metindir ve bu metin üzerinde anahtar kelimeleri test ediyoruz.\"\n",
     "keywords = [\"başka\", \"bir\", \"anahtar\", \"kelimeleri\", \"test\"] #bu keywordsler tf-değerinden alınarak arraylere çevrilmeli \n",
     "    \n",
   },
   {
    "cell_type": "code",
+   "execution_count": null,
    "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
+    "from sklearn.metrics.pairwise import cosine_similarity\n",
+    "# Örnek metin ve anahtar kelimeler\n",
+    "#combined verileri \n",
+    "def get_text(file_path='processed_data.csv'):\n",
+    "    \"\"\"CSV dosyasını okur ve veriyi düzenler.\"\"\"\n",
+    "    with open(file_path, 'r',encoding='utf-8') as file:\n",
+    "        raw_text = file.read()\n",
+    "    \n",
+    "    text = parse_text(raw_text)\n",
+    "    \n",
+    "    # Veri çerçevesi oluştur\n",
+    "    df_text = pd.DataFrame(text)\n",
+    "    \n",
+    "    return df_text\n",
+    "\n",
+    "def get_keywords(file_path='keywords_with_scores.csv'):\n",
+    "    \"\"\"CSV dosyasını okur ve veriyi düzenler.\"\"\"\n",
+    "    with open(file_path, 'r',encoding='utf-8') as file:\n",
+    "        raw_text = file.read()\n",
+    "    \n",
+    "    keywords = parse_text(raw_text)\n",
+    "    \n",
+    "    # Veri çerçevesi oluştur\n",
+    "    df_keyword = pd.DataFrame(keywords)\n",
+    "    \n",
+    "    return df_keyword\n",
+    "\n",
+    "\n",
+    "def calculate_keyword_similarity(text, keywords):\n",
+    "    \"\"\"Score each keyword against a text using TF-IDF cosine similarity.\n",
+    "\n",
+    "    Args:\n",
+    "        text: A single document (str) or an iterable of documents. The\n",
+    "            vectorizer is fitted on every document, but similarities are\n",
+    "            computed against the first document only.\n",
+    "        keywords: Iterable of keyword strings to score.\n",
+    "\n",
+    "    Returns:\n",
+    "        List of (keyword, similarity) tuples, in the order of `keywords`.\n",
+    "    \"\"\"\n",
+    "    # fit_transform expects an iterable of documents and rejects a bare string,\n",
+    "    # so wrap a single text in a list.\n",
+    "    documents = [text] if isinstance(text, str) else list(text)\n",
+    "    keywords = list(keywords)\n",
+    "\n",
+    "    tfidf_vectorizer = TfidfVectorizer()\n",
+    "    text_tfidf = tfidf_vectorizer.fit_transform(documents)\n",
+    "    if not keywords:\n",
+    "        return []\n",
+    "\n",
+    "    # Vectorize all keywords in one call instead of one transform per keyword.\n",
+    "    keyword_tfidf = tfidf_vectorizer.transform(keywords)\n",
+    "\n",
+    "    # Row 0 holds the similarity of the first document to each keyword.\n",
+    "    similarities = cosine_similarity(text_tfidf[:1], keyword_tfidf)[0]\n",
+    "    return list(zip(keywords, similarities))\n",
+    "   \n",
+    "\n",
+    "\n",
+    "\n",
+    "    \n",
+    "# Uygunluk skorunu hesapla\n",
+    "similarity_results = calculate_keyword_similarity(text, keywords)\n",
+    "top_5_keywords = sorted(similarity_results, key=lambda x: x[1], reverse=True)[:5]\n",
+    "# Her bir anahtar kelimenin uyumluluk skorunu yazdır\n",
+    "\n",
+    "for keyword, similarity in top_5_keywords:\n",
+    "    print(f\"Keyword: {keyword}, Similarity: {similarity}\")\n",
+    "    #print(f\"Keyword: '{keyword}' - Relevance score: {score:.4f}\")\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
    "source": [
     "\n",
     "# BERT Tokenizer ve Model'i yükleyin\n",
   },
   {
    "cell_type": "code",
+   "execution_count": null,
    "metadata": {},
+   "outputs": [],
    "source": [
     "#mongodb üzerinden combined_textleri çek\n",
     "import csv\n",
     "    print(f\"Keyword: {keyword}, Similarity: {similarity}\")"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Title değerlerini bir DataFrame'e dönüştürür ve CSV dosyasına yazar."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "metin başlıkları 'titles_texts.csv' dosyasına başarıyla yazıldı.\n",
+      "             title\n",
+      "0    Pşıqo Ahecaqo\n",
+      "1  Craterolophinae\n",
+      "2       Notocrabro\n",
+      "3  Ibrahim Sissoko\n",
+      "4      Salah Cedid\n"
+     ]
+    }
+   ],
+   "source": [
+    "from pymongo import MongoClient\n",
+    "import pandas as pd\n",
+    "import csv\n",
+    "\n",
+    "# MongoDB'ye bağlanma\n",
+    "\n",
+    "def get_titles(database_name='combined_text', collection_name='text', host='localhost', port=27017,batch_size=1000,output_file='titles_texts.csv'):\n",
+    "    \"\"\"Export the `title` field of every document in a MongoDB collection to CSV.\n",
+    "\n",
+    "    Args:\n",
+    "        database_name: Name of the MongoDB database to read.\n",
+    "        collection_name: Name of the collection inside that database.\n",
+    "        host: MongoDB host name.\n",
+    "        port: MongoDB port.\n",
+    "        batch_size: Number of documents the cursor fetches per round trip.\n",
+    "        output_file: Path of the CSV file to (over)write.\n",
+    "\n",
+    "    Returns:\n",
+    "        A DataFrame built from the fetched documents.\n",
+    "    \"\"\"\n",
+    "    client = MongoClient(f'mongodb://{host}:{port}/')\n",
+    "    try:\n",
+    "        collection = client[database_name][collection_name]\n",
+    "\n",
+    "        # Read the collection in a single pass. Paging with skip()/limit() and no\n",
+    "        # sort order has no guaranteed ordering between queries and re-scans the\n",
+    "        # skipped documents on every page.\n",
+    "        cursor = collection.find({}, {\"_id\": 0, \"title\": 1}).batch_size(batch_size)\n",
+    "        documents = list(cursor)\n",
+    "    finally:\n",
+    "        # Release the connection even if the query fails.\n",
+    "        client.close()\n",
+    "\n",
+    "    df = pd.DataFrame(documents)\n",
+    "\n",
+    "    # Open the file once; documents without a `title` field are skipped.\n",
+    "    with open(output_file, mode='w', newline='', encoding='utf-8') as file:\n",
+    "        writer = csv.writer(file)\n",
+    "        writer.writerow([\"titles\"])  # CSV header\n",
+    "        writer.writerows([doc['title']] for doc in documents if 'title' in doc)\n",
+    "\n",
+    "    print(f\"metin başlıkları '{output_file}' dosyasına başarıyla yazıldı.\")\n",
+    "\n",
+    "    # Preview the DataFrame\n",
+    "    print(df.head())\n",
+    "\n",
+    "    return df\n",
+    "\n",
+    "# Export all titles to the CSV file\n",
+    "text=get_titles(batch_size=5000)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Veri güncelleme "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "   Document_Index        Keyword_1    Keyword_2      Keyword_3  \\\n",
+      "0               1          ahecaqo        pşıqo         çerkes   \n",
+      "1               2  craterolophinae  depastridae  craterolophus   \n",
+      "2               3       notocrabro   crabronina       oymağına   \n",
+      "3               4          sissoko    wolfsburg  panathinaikos   \n",
+      "4               5             baas        cedid          salah   \n",
+      "\n",
+      "         Keyword_4    Keyword_5 Keyword_6     Keyword_7  Keyword_8 Keyword_9  \\\n",
+      "0         çerkesya         1777  savaşına     lakapları         qo    bjeduğ   \n",
+      "1  altfamilyasıdır        clark      1863       cinsler  taksonomi      2023   \n",
+      "2          cinstir  bağlantılar  kaynakça         ghost     ghetto      ghez   \n",
+      "3        konyaspor    deportivo   étienne        coruña  kiralandı  imzaladı   \n",
+      "4             1970         1993      1926  siyasetçiler  fraksiyon     bitar   \n",
+      "\n",
+      "   ... Score_1  Score_2  Score_3  Score_4  Score_5  Score_6  Score_7  Score_8  \\\n",
+      "0  ...  0.5162   0.4130   0.3481   0.1903   0.1850   0.1740   0.1032   0.1032   \n",
+      "1  ...  0.7030   0.4687   0.2343   0.2052   0.2011   0.1808   0.1745   0.1583   \n",
+      "2  ...  0.6762   0.6762   0.2125   0.1782   0.0714   0.0588   0.0000   0.0000   \n",
+      "3  ...  0.8107   0.2490   0.1245   0.1159   0.1159   0.1139   0.1121   0.1065   \n",
+      "4  ...  0.5065   0.4892   0.2026   0.1679   0.1610   0.1403   0.1205   0.1062   \n",
+      "\n",
+      "   Score_9  Score_10  \n",
+      "0   0.1032    0.1032  \n",
+      "1   0.1555    0.1458  \n",
+      "2   0.0000    0.0000  \n",
+      "3   0.0913    0.0896  \n",
+      "4   0.1062    0.1062  \n",
+      "\n",
+      "[5 rows x 21 columns]\n"
+     ]
+    }
+   ],
+   "source": [
+    "import pandas as pd\n",
+    "\n",
+    "# Sample keywords together with their TF-IDF scores\n",
+    "keyword_data = pd.read_csv('keywords_with_scores.csv')\n",
+    "\n",
+    "df = pd.DataFrame(data=keyword_data)\n",
+    "print(df.head(5))\n"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
+    "import pandas as pd\n",
+    "from langdetect import detect, DetectorFactory\n",
+    "\n",
+    "DetectorFactory.seed = 0  # Her zaman aynı sonuçları almak için\n",
+    "\n",
+    "def is_turkish(text):\n",
+    "    \"\"\"Return True when langdetect classifies `text` as Turkish ('tr').\n",
+    "\n",
+    "    Non-string or blank input returns False, as does text for which\n",
+    "    langdetect raises LangDetectException.\n",
+    "    \"\"\"\n",
+    "    if not isinstance(text, str) or not text.strip():\n",
+    "        return False\n",
+    "    # Local import keeps the function self-contained within this cell.\n",
+    "    from langdetect.lang_detect_exception import LangDetectException\n",
+    "    try:\n",
+    "        return detect(text) == 'tr'\n",
+    "    except LangDetectException:\n",
+    "        # Only langdetect's own detection failures mean \"not Turkish\";\n",
+    "        # any other exception is a real error and must surface.\n",
+    "        return False\n",
+    "\n",
+    "def filter_turkish_keywords(text):\n",
+    "    \"\"\"Split a comma-separated keyword cell and keep the entries detected as Turkish.\n",
+    "\n",
+    "    Args:\n",
+    "        text: A cell value; missing values (NaN/None) are accepted.\n",
+    "\n",
+    "    Returns:\n",
+    "        List of stripped keywords for which is_turkish() is True; [] for a missing value.\n",
+    "    \"\"\"\n",
+    "    if pd.isna(text):\n",
+    "        return []  # Missing values become an empty list\n",
+    "    # str() guards against non-string cells (e.g. a column pandas parsed as numeric),\n",
+    "    # which have no .split method.\n",
+    "    candidates = (kw.strip() for kw in str(text).split(','))\n",
+    "    return [kw for kw in candidates if kw and is_turkish(kw)]\n",
+    "\n",
+    "# Read the keyword table.\n",
+    "# NOTE(review): the original placeholder path 'path_to_your_file.csv' was replaced with the\n",
+    "# file loaded by the previous cell, which has the Keyword_1..Keyword_10 columns - confirm.\n",
+    "df = pd.read_csv('keywords_with_scores.csv')\n",
+    "\n",
+    "# Keyword columns to filter: Keyword_1 .. Keyword_10\n",
+    "keyword_columns = [f'Keyword_{i}' for i in range(1, 11)]\n",
+    "\n",
+    "# Fail early with a clear message if the file does not have the expected layout\n",
+    "missing_columns = [col for col in keyword_columns if col not in df.columns]\n",
+    "if missing_columns:\n",
+    "    raise KeyError(f\"Missing keyword columns: {missing_columns}\")\n",
+    "\n",
+    "# For every keyword column, add a '<column>_Turkish' column holding only the Turkish keywords\n",
+    "for col in keyword_columns:\n",
+    "    df[f'{col}_Turkish'] = df[col].apply(filter_turkish_keywords)\n",
+    "\n",
+    "print(df.head())\n"
    ]
   }
  ],