Spaces: Build error
Upload 2 files
Files changed:
- .gitattributes +1 -0
- combined.ipynb +512 -79
- processed_data.csv +3 -0

.gitattributes
CHANGED
@@ -35,3 +35,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 combined_output.csv filter=lfs diff=lfs merge=lfs -text
 combined_texts.csv filter=lfs diff=lfs merge=lfs -text
+processed_data.csv filter=lfs diff=lfs merge=lfs -text
combined.ipynb
CHANGED
Lines removed by this commit (their replacements appear in the updated and added cells below):

@@ -9,7 +9,7 @@
-  "execution_count":
+  "execution_count": 3,

@@ -35,87 +35,392 @@
-      batch_documents = []
-      combined_texts = [doc['combined'] for doc in cursor if 'combined' in doc]
-          word = line.strip()
-          if word and word not in existing_stop_words:
-              existing_stop_words.add(word)

@@ -123,6 +428,89 @@

@@ -132,17 +520,23 @@
-      X = vectorizer.fit_transform(combined_texts)

@@ -151,7 +545,7 @@
-          keywords = extract_keywords_tfidf(text, stop_words_list,top_n)

@@ -165,62 +559,101 @@
-      return results
-  def calculate_keyword_similarity(text, keywords):
-      # Build a list made up of the text and the keywords
-      similarity_array = []
-      # Build the TF-IDF matrix
-      vectorizer = TfidfVectorizer()
-      tfidf_matrix = vectorizer.fit_transform(documents)
-      # Get the text vector and the keywords vector
-      text_vector = tfidf_matrix[0]
-      keywords_vector = tfidf_matrix[1]
Defining the Turkish stop words

    """"""
    #- here we load the turkish_stop_words
    def load_stop_words(file_path, existing_stop_words='gereksiz_kelimeler.txt'):
        """Reads the stop words from a file and builds a list.
        Takes existing stop words into account if there are any."""

        if existing_stop_words is None:
            existing_stop_words = set()
        else:
            existing_stop_words = set(existing_stop_words)

        with open(file_path, 'r', encoding='utf-8') as file:
            for line in file:
                word = line.strip()
                if word and word not in existing_stop_words:
                    existing_stop_words.add(word)

        return list(existing_stop_words)

    # Load the Turkish stop words file, checking against the existing stop words
    stop_words_list = load_stop_words('gereksiz_kelimeler.txt')

    #----------------------------------------------------------------------------------------------------
Exporting the data in the MongoDB combined_text collection to CSV

    # fetch the combined texts from MongoDB
    import csv
    from pymongo import MongoClient

    def mongo_db_combined_texts_to_csv(database_name='combined_text', collection_name='text', host='localhost', port=27017, batch_size=100, output_file='combined_texts.csv'):
        client = MongoClient(f'mongodb://{host}:{port}/')
        db = client[database_name]
        collection = db[collection_name]

        # get the total number of documents
        total_documents = collection.count_documents({})
        #batch_documents = []

        # Open the CSV file and get it ready for writing
        with open(output_file, mode='w', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            writer.writerow(["combined"])  # CSV header

        # Loop to fetch the documents batch_size at a time
        for i in range(0, total_documents, batch_size):
            cursor = collection.find({}, {"combined": 1, "_id": 0}).skip(i).limit(batch_size)
            combined_texts = [doc['combined'] for doc in cursor if 'combined' in doc]  # pulls the values of the combined field

            # Write the batch to the CSV
            with open(output_file, mode='a', newline='', encoding='utf-8') as file:
                writer = csv.writer(file)
                for text in combined_texts:
                    writer.writerow([text])

        print(f"combined metinler '{output_file}' dosyasına başarıyla yazıldı.")

    # Write the documents to the CSV file
    text = mongo_db_combined_texts_to_csv(batch_size=100)
    #batch_documents.extend((combined_texts, len(combined_texts)))
    #append adds its argument to the list as a single element, i.e. list1 = [1, 2, 3, [4, 5]]
    #whereas extend adds the new list's items one by one, i.e. list1 = [1, 2, 3, 4, 5]
    #return batch_documents

    # Fetch the documents and the document count in batches
    #combined_texts = mongo_db_combined_texts(batch_size=1000)

    # You can process each batch separately
    #print(f"Toplam döküman sayısı:{len(combined_texts)}")

    #for index, text in enumerate(combined_texts[:10]):
        #print(f"Döküman {index + 1}: {text}")

    #print(combined_texts)
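The skip()/limit() pagination above rescans the collection from the beginning for every batch, and the output file is reopened once per batch. A minimal alternative sketch, under the same assumptions about the combined_text database and text collection (stream_combined_texts_to_csv is a hypothetical name), that makes a single cursor pass and keeps the file open:

    import csv
    from pymongo import MongoClient

    def stream_combined_texts_to_csv(database_name='combined_text', collection_name='text',
                                     host='localhost', port=27017,
                                     output_file='combined_texts.csv'):
        collection = MongoClient(f'mongodb://{host}:{port}/')[database_name][collection_name]
        with open(output_file, mode='w', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            writer.writerow(["combined"])  # CSV header
            # Single cursor pass; the driver fetches documents from the server in batches internally.
            for doc in collection.find({"combined": {"$exists": True}}, {"combined": 1, "_id": 0}):
                writer.writerow([doc["combined"]])

Either way, the exported combined_texts.csv is what the later TF-IDF cells read back in.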
    import csv
    from pymongo import MongoClient
    import pandas as pd

    def fetch_from_database(database_name='combined_text', collection_name='text', host='localhost', port=27017, batch_size=100):
        client = MongoClient(f'mongodb://{host}:{port}/')
        db = client[database_name]
        collection = db[collection_name]

        # Get the total number of documents
        total_documents = collection.count_documents({})
        combined_texts = []

        # Loop to fetch the documents batch_size at a time
        for i in range(0, total_documents, batch_size):
            cursor = collection.find({}, {"combined": 1, "_id": 0}).skip(i).limit(batch_size)
            combined_texts.extend([doc['combined'] for doc in cursor if 'combined' in doc])  # pulls the values of the combined field

        return combined_texts

    # Function that truncates the texts
    def truncate_text(text, max_words=300):
        words = text.split()  # Splits the text into words
        return ' '.join(words[:max_words])  # Takes the first max_words words

    # Fetch the data from the database and truncate it
    def fetch_and_truncate_data(database_name, collection_name, host, port, max_words=300):
        # Fetch the data from the database
        combined_texts = fetch_from_database(database_name, collection_name, host, port)

        # Truncate the texts
        truncated_texts = [truncate_text(text, max_words) for text in combined_texts]

        return truncated_texts

    # Save the truncated data to CSV
    def save_to_csv(data, file_path):
        df = pd.DataFrame(data, columns=['combined'])
        df.to_csv(file_path, encoding='utf-8', index=False)

    # The variable definitions
    database_name = 'combined_text'
    collection_name = 'text'
    host = 'localhost'
    port = 27017
    batch_size = 100
    max_words = 300
    output_file = 'processed_data.csv'

    # Fetch the data and process it
    truncated_texts = fetch_and_truncate_data(database_name, collection_name, host, port, max_words)
    save_to_csv(truncated_texts, output_file)
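The processed_data.csv produced by this cell is committed as a Git LFS object of roughly 419 MB (see the pointer file at the end of this commit), so reading it back whole may not be practical. A small sketch, assuming the single combined column written by save_to_csv above, that loads it in chunks:

    import pandas as pd

    # Read the LFS-tracked CSV in manageable pieces instead of all at once.
    for chunk in pd.read_csv('processed_data.csv', chunksize=10_000):
        texts = chunk['combined'].dropna().tolist()
        # ... hand each chunk of texts to the TF-IDF / embedding steps below ...
        print(f"loaded a chunk of {len(texts)} rows")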
Extracting keywords with TF-IDF

    import csv
    from sklearn.feature_extraction.text import TfidfVectorizer
    from joblib import Parallel, delayed
    import pandas as pd

    df = pd.read_csv('combined_texts.csv')
    combined = df['combined'].tolist()

    def extract_keywords_tfidf(combined, stop_words_list, top_n=10):
        """Extracts the keywords with TF-IDF, using the stop words list."""
        vectorizer = TfidfVectorizer(stop_words=stop_words_list)
        X = vectorizer.fit_transform(combined)  # read this from the csv
        feature_names = vectorizer.get_feature_names_out()  # maps each word to its index in the tf-idf vector
        #sorted_keywords = [feature_names[i] for i in X.sum(axis=0).argsort()[0, ::-1]]

        top_keywords_per_document = []  # the best keywords for each document
        top_tfidf_scores_per_document = []  # the highest tf-idf scores for each document

        # Process each document
        for row in X:
            tfidf_scores = row.toarray().flatten()  # flatten into a plain array
            top_indices = tfidf_scores.argsort()[-top_n:][::-1]  # find the highest n scores

            # find the highest-scoring words and their scores
            top_keywords = [feature_names[i] for i in top_indices]
            top_tfidf_scores = [tfidf_scores[i] for i in top_indices]

            top_keywords_per_document.append(top_keywords)
            top_tfidf_scores_per_document.append(top_tfidf_scores)

        return top_keywords_per_document, top_tfidf_scores_per_document

    # Extract the keywords and collect the results
    top_keywords_per_document, top_tfidf_scores_per_document = extract_keywords_tfidf(combined, stop_words_list, top_n=10)

    # Display the results
    for i, (keywords, scores) in enumerate(zip(top_keywords_per_document, top_tfidf_scores_per_document)):
        print(f"Döküman {i+1}:")
        for keyword, score in zip(keywords, scores):
            print(f"{keyword}: {score:.4f}")
        print("\n")

Recorded output: a scikit-learn UserWarning ("Your stop_words may be inconsistent with your preprocessing. Tokenizing the stop words generated tokens ['leh'] not in stop_words.") followed by a KeyboardInterrupt raised inside extract_keywords_tfidf while computing top_indices.
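The interrupted run above densifies every row with toarray() before sorting, which is the expensive part for a large, sparse TF-IDF matrix. A sketch of a leaner variant (top_keywords_sparse is a hypothetical helper) that sorts only the nonzero entries of each CSR row:

    import numpy as np

    def top_keywords_sparse(X, feature_names, top_n=10):
        """Top-n keywords per document using only the nonzero entries of each CSR row."""
        keywords_per_doc, scores_per_doc = [], []
        for row in X:                                     # X is the CSR matrix from fit_transform
            order = np.argsort(row.data)[::-1][:top_n]    # sort only the nonzero scores
            cols = row.indices[order]                     # column indices of the top scores
            keywords_per_doc.append([feature_names[i] for i in cols])
            scores_per_doc.append(row.data[order].tolist())
        return keywords_per_doc, scores_per_doc

The next cell in the commit takes a different route and parallelizes the per-row work with joblib instead.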
    import re
    import pandas as pd
    from sklearn.feature_extraction.text import TfidfVectorizer
    from joblib import Parallel, delayed

    # Read the CSV file
    df = pd.read_csv('combined_texts.csv')
    combined = df['combined'].tolist()

    def pad_or_truncate(texts, max_length):
        """Truncates or pads the texts to a fixed length."""
        padded_texts = []
        for text in texts:
            words = text.split()
            if len(words) > max_length:
                padded_texts.append(' '.join(words[:max_length]))
            else:
                padded_texts.append(' '.join(words + [''] * (max_length - len(words))))
        return padded_texts

    # Padding length
    max_length = 300  # e.g. cap the text length at 300 words

    # Pad or truncate the texts
    combined_padded = pad_or_truncate(combined, max_length)

    def parse_text(text):
        """Parses the given text and turns it into a regular structure."""
        # Split into rows
        lines = text.split('|-')

        data = []
        for line in lines:
            line = line.strip()
            if not line or line.startswith("align"):
                continue

            # Split the row into parts
            parts = re.split(r'\s*\|\s*', line)  # how does this split the rows into parts??

            # Clean the values and build the structure
            if len(parts) >= 2:  # what are season and team, and why were they defined as variables?
                season = parts[0].strip()
                team = parts[1].strip()
                stats = [p.strip() for p in parts[2:] if p.strip()]
                data.append([season, team] + stats)

        return data

    def clean_data(file_path):
        """Reads the CSV file and tidies the data."""
        with open(file_path, 'r') as file:
            raw_text = file.read()

        data = parse_text(raw_text)

        # Build the data frame
        df = pd.DataFrame(data, columns=['kaynakça'])

        return df

    # Clean the CSV file and build a tidy DataFrame
    cleaned_df = clean_data('combined_texts.csv')

    # Inspect the tidied data
    print(cleaned_df.head())

    def extract_keywords_tfidf(combined, stop_words_list, top_n=10, n_jobs=-1):
        """Extracts keywords with TF-IDF, uses the stop words list, and processes rows in parallel."""
        vectorizer = TfidfVectorizer(stop_words=stop_words_list)

        # Build the TF-IDF matrix (n_jobs is used to spread the per-row work over the CPUs)
        X = vectorizer.fit_transform(combined)  # read this from the csv
        feature_names = vectorizer.get_feature_names_out()  # maps each word to its index in the tf-idf vector

        # Find the best keywords and tf-idf scores for each document in parallel
        def process_row(row):
            tfidf_scores = row.toarray().flatten()  # flatten into a plain array
            top_indices = tfidf_scores.argsort()[-top_n:][::-1]  # find the highest n scores

            # Find the highest-scoring words and their scores
            top_keywords = [feature_names[i] for i in top_indices]
            top_tfidf_scores = [tfidf_scores[i] for i in top_indices]
            return top_keywords, top_tfidf_scores

        results = Parallel(n_jobs=n_jobs)(delayed(process_row)(row) for row in X)

        # Split the results into separate lists
        top_keywords_per_document, top_tfidf_scores_per_document = zip(*results)

        return top_keywords_per_document, top_tfidf_scores_per_document

    # Extract the keywords and collect the results
    top_keywords_per_document, top_tfidf_scores_per_document = extract_keywords_tfidf(combined, stop_words_list, top_n=10, n_jobs=-1)

    # Display the results
    for i, (keywords, scores) in enumerate(zip(top_keywords_per_document, top_tfidf_scores_per_document)):
        print(f"Döküman {i+1}:")
        for keyword, score in zip(keywords, scores):
            print(f"{keyword}: {score:.4f}")
        print("\n")

Recorded output: the same stop_words UserWarning, then a KeyboardInterrupt raised while the joblib Parallel call was still retrieving results from its workers.
    from sentence_transformers import SentenceTransformer

    model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')

    # Top_keywords embedding
    keyword_embeddings = model.encode(top_keywords_per_document)
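SentenceTransformer.encode expects strings, while top_keywords_per_document holds one list of keywords per document, so the call above may not produce one embedding per document as intended. A sketch of one way to do that, assuming the variables defined in the cell above, is to join each document's keywords into a single string first:

    # One string of keywords per document, then one embedding per document.
    keyword_sentences = [' '.join(keywords) for keywords in top_keywords_per_document]
    keyword_embeddings = model.encode(keyword_sentences)  # shape: (n_documents, embedding_dim)

The following cell scores keywords against a text directly with TF-IDF and cosine similarity.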
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity

    def calculate_keyword_similarity(text, keywords):
        # Build the TF-IDF matrix
        tfidf_vectorizer = TfidfVectorizer()

        # turn the text and the keywords into tf-idf vectors
        text_tfidf = tfidf_vectorizer.fit_transform([text])  # this should come from the combined column

        # similarity computation
        similarity_array = []
        for keyword in keywords:
            # Turn each keyword into a TF-IDF vector
            keyword_tfidf = tfidf_vectorizer.transform([keyword])  # takes the keywords one at a time

            # Compute the similarity with cosine similarity
            similarity = cosine_similarity(text_tfidf, keyword_tfidf)[0][0]

            # Record the keyword and its similarity score
            similarity_array.append((keyword, similarity))

        return similarity_array

    # Example text and keywords
    # combined data
    text = "Bu bir örnek metindir ve bu metin üzerinde anahtar kelimeleri test ediyoruz."
    keywords = ["başka", "bir", "anahtar", "kelimeleri", "test"]  # these keywords should be taken from the tf-idf step and turned into arrays

    # Compute the relevance scores
    similarity_results = calculate_keyword_similarity(text, keywords)
    top_5_keywords = sorted(similarity_results, key=lambda x: x[1], reverse=True)[:5]

    # Print each keyword's relevance score
    for keyword, similarity in top_5_keywords:
        print(f"Keyword: {keyword}, Similarity: {similarity}")
        #print(f"Keyword: '{keyword}' - Relevance score: {score:.4f}")

Recorded output:
    Keyword: bir, Similarity: 0.26726124191242445
    Keyword: anahtar, Similarity: 0.26726124191242445
    Keyword: kelimeleri, Similarity: 0.26726124191242445
    Keyword: test, Similarity: 0.26726124191242445
    Keyword: başka, Similarity: 0.0
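Because the vectorizer here is fitted on the single example text, any keyword that never occurs in that text (such as "başka" above) becomes an all-zero vector and scores 0.0. The cell removed by this commit apparently fitted the vectorizer on the text and the keywords together; a hedged sketch of that variant (keyword_set_similarity is a hypothetical name):

    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity

    def keyword_set_similarity(text, keywords):
        # Fit on both the document and the joined keywords so they share one vocabulary.
        documents = [text, ' '.join(keywords)]
        tfidf_matrix = TfidfVectorizer().fit_transform(documents)
        text_vector, keywords_vector = tfidf_matrix[0], tfidf_matrix[1]
        # One score for how well the whole keyword set matches the text.
        return cosine_similarity(text_vector, keywords_vector)[0][0]

The remaining hunks touch the existing BERT keyword/embedding cell and the closing export and heading cells, shown next.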
    # Load the BERT Tokenizer and Model
    ...
    def extract_keywords_tfidf(combined_texts, stop_words_list, top_n=5):
        """Extracts the keywords with TF-IDF, using the stop words list."""
        vectorizer = TfidfVectorizer(stop_words=stop_words_list)
        X = vectorizer.fit_transform(combined_texts)  # read this from the csv
        feature_names = vectorizer.get_feature_names_out()
        #sorted_keywords = [feature_names[i] for i in X.sum(axis=0).argsort()[0, ::-1]]

        top_keywords_per_document = []  # the best keywords for each document
        top_tfidf_scores_per_document = []  # the highest tf-idf scores for each document
        #------------------------------------------------------------------------------------------
        for row in X:
            tfidf_scores = row.toarray().flatten()
            top_indices = tfidf_scores.argsort()[-top_n:][::-1]  # find the highest n scores
            top_keywords = [feature_names[i] for i in top_indices]
            top_tfidf_scores = [tfidf_scores[i] for i in top_indices]

            top_tfidf_scores_per_document.append(top_tfidf_scores)
            top_keywords_per_document.append(top_keywords)

        return top_keywords_per_document

    # Extract the keywords and build embeddings with BERT
    ...
        for text in combined_texts:
            # Extract the keywords
            keywords = extract_keywords_tfidf(text, stop_words_list, top_n=15)

            # Build the embedding with BERT
            inputs = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)
    ...
                'embedding': embeddings
            })

        return results

    results = process_texts
    results
    #process all the texts

Recorded output: <function __main__.process_texts(combined_texts, stop_words_list, top_n)>

Fetching the database from MongoDB

    # fetch the combined texts from MongoDB
    import csv
    from pymongo import MongoClient

    def mongo_db_combined_texts_to_csv(database_name='combined_text', collection_name='text', host='localhost', port=27017, batch_size=1000, output_file='combined_texts.csv'):
        client = MongoClient(f'mongodb://{host}:{port}/')
        db = client[database_name]
        collection = db[collection_name]

        # get the total number of documents
        total_documents = collection.count_documents({})
        #batch_documents = []

        # Open the CSV file and get it ready for writing
        with open(output_file, mode='w', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            writer.writerow(["combined"])  # CSV header

        # Loop to fetch the documents batch_size at a time
        for i in range(0, total_documents, batch_size):
            cursor = collection.find({}, {"combined": 1, "_id": 0}).skip(i).limit(batch_size)
            combined_texts = [doc['combined'] for doc in cursor if 'combined' in doc]  # pulls the values of the combined field

            # Write the batch to the CSV
            with open(output_file, mode='a', newline='', encoding='utf-8') as file:
                writer = csv.writer(file)
                for text in combined_texts:
                    writer.writerow([text])

        print(f"combined metinler '{output_file}' dosyasına başarıyla yazıldı.")

    # Write the documents to the CSV file
    text = mongo_db_combined_texts_to_csv(batch_size=1000)
    #batch_documents.extend((combined_texts, len(combined_texts)))
    #append adds its argument to the list as a single element, i.e. list1 = [1, 2, 3, [4, 5]]
    #whereas extend adds the new list's items one by one, i.e. list1 = [1, 2, 3, 4, 5]
    #return batch_documents

    # Fetch the documents and the document count in batches
    #combined_texts = mongo_db_combined_texts(batch_size=1000)

    # You can process each batch separately
    #print(f"Toplam döküman sayısı:{len(combined_texts)}")

    #for index, text in enumerate(combined_texts[:10]):
        #print(f"Döküman {index + 1}: {text}")

    #print(combined_texts)

Recorded output: combined metinler 'combined_texts.csv' dosyasına başarıyla yazıldı.

Importing the unnecessary words from 'gereksiz_kelimeler.txt'

TF-IDF scores
processed_data.csv
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:97993e92396f03aa90162dad808bbd3c655a988b37d7ba45704b0058371c6172
+size 419458630