itsOwen committed
Commit ab80480 · 1 Parent(s): d2e413a

ollama experimental

.DS_Store ADDED
Binary file (6.15 kB).
 
.gitignore DELETED
@@ -1,69 +0,0 @@
- # Python cache files
- __pycache__/
- *.py[cod]
- *$py.class
-
- # Virtual environment
- venv/
-
- # Streamlit cache
- .streamlit/
-
- # PyCharm files
- .idea/
-
- # VS Code files
- .vscode/
-
- # Jupyter Notebook
- .ipynb_checkpoints
-
- # Environment variables
- .env
-
- # Operating system files
- .DS_Store
- Thumbs.db
-
- # Log files
- *.log
-
- # Database files
- *.db
- *.sqlite3
-
- # Chat history (if you don't want to version control it)
- chat_history.json
-
- # Compiled Python files
- *.pyc
-
- # Package directories
- dist/
- build/
- *.egg-info/
-
- # Backup files
- *~
- *.bak
-
- # Coverage reports
- htmlcov/
- .coverage
- .coverage.*
- coverage.xml
-
- # Pytest cache
- .pytest_cache/
-
- # mypy cache
- .mypy_cache/
-
- # Scrapy stuff:
- .scrapy
-
- # Sphinx documentation
- docs/_build/
-
- # PyBuilder
- target/
__init__.py ADDED
File without changes
app/__pycache__/streamlit_web_scraper_chat.cpython-312.pyc ADDED
Binary file (1.03 kB).

app/__pycache__/ui_components.cpython-312.pyc ADDED
Binary file (8.39 kB).

app/__pycache__/utils.cpython-312.pyc ADDED
Binary file (1.86 kB).
 
app/streamlit_web_scraper_chat.py CHANGED
@@ -1,8 +1,9 @@
  import asyncio
+ import streamlit as st
  from src.web_extractor import WebExtractor

  class StreamlitWebScraperChat:
-     def __init__(self, model_name: str = "gpt-4o-mini"):
+     def __init__(self, model_name):
          self.web_extractor = WebExtractor(model_name=model_name)

      def process_message(self, message: str) -> str:
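With the default model removed from `__init__`, callers must now pass the model explicitly. A minimal caller-side sketch (the constructor comes from this diff; the concrete model values are illustrative assumptions):

```python
# Illustrative only: assumes the repository root is on PYTHONPATH.
from app.streamlit_web_scraper_chat import StreamlitWebScraperChat
from src.ollama_models import OllamaModel

# Pass an OpenAI-style model name as a plain string...
chat = StreamlitWebScraperChat(model_name="gpt-4o-mini")

# ...or an OllamaModel instance, as main.py now does for "ollama:" selections.
local_chat = StreamlitWebScraperChat(model_name=OllamaModel("llama3"))

print(chat.process_message("https://example.com"))
```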
chat_history.json ADDED
@@ -0,0 +1 @@
+ {"1724123016.501483": {"messages": [{"role": "user", "content": "https://news.ycombinator.com/"}, {"role": "assistant", "content": "I've fetched and preprocessed the content from https://news.ycombinator.com/. What would you like to know about it?"}, {"role": "user", "content": "extract all the data on the website in form of csv"}, {"role": "assistant", "content": "```csv\ntitle,points,author,time_ago,comments,url\r\n13ft \u2013 A site similar to 12ft.io but self-hosted,299,darknavi,7 hours ago,141,github.com/wasi-master\r\nLaunch HN: Sorcerer (YC S24) \u2013 Weather balloons that collect more data,255,tndl,10 hours ago,120,N/A\r\nLet's Write a Reverb,73,notagoodidea,4 hours ago,8,signalsmith-audio.co.uk\r\nOn the cruelty of really teaching computing science (1988),53,torstenvl,3 hours ago,30,utexas.edu\r\n'Rare species' not seen in the area for 50 years spotted on Arizona trail camera,62,wglb,6 hours ago,8,phys.org\r\nLenticular Clock,84,animal_spirits,7 hours ago,17,instructables.com\r\nMusic recommendation system using transformer models,68,panarky,5 hours ago,32,research.google\r\nClassifying all of the pdfs on the internet,258,Nydhal,14 hours ago,91,snats.xyz\r\nInfisical (YC W23) Is Hiring Full Stack Engineer (Remote),N/A,N/A,2 hours ago,N/A,ycombinator.com\r\nMass Market DVDs Are Dead: Long Live Heritage Physical Media,13,throw0101d,3 hours ago,5,variety.com\r\nMigrating Mess with DNS to Use PowerDNS,92,hasheddan,9 hours ago,21,jvns.ca\r\nThe gigantic and unregulated power plants in the cloud,331,ahubert,11 hours ago,163,berthub.eu\r\nAsk HN: How do you work as a tech lead?,53,obvthroaway,3 hours ago,32,N/A\r\nCardinal \u2013 Virtual modular synthesizer plugin,68,iscream26,6 hours ago,15,kx.studio\r\n\"Spine \u2013 2D skeletal animation for games, with physical secondary motions\",64,diggan,9 hours ago,23,esotericsoftware.com\r\nNetboot.xyz: your favorite operating systems in one place,201,thunderbong,7 hours ago,43,netboot.xyz\r\nBit-banging a BASIC Birthday,111,debo_,15 hours ago,15,thingswemake.com\r\nPage Turns: Literary Translations on the American Ballet Theater's Summer Stage,6,bryanrasmussen,3 hours ago,N/A,lithub.com\r\nThe oral history of the Dinosaur Input Device,37,gelstudios,4 hours ago,2,vfxblog.com\r\nImgpls \u2013 An extension to load Imgur images directly,99,goranmoomin,8 hours ago,42,andadinosaur.com\r\nProcreate's anti-AI pledge attracts praise from digital creatives,26,TheCleric,1 hour ago,4,theverge.com\r\nSearching a Codebase in English,24,dakshgupta,5 hours ago,9,greptile.com\r\nAI companies are pivoting from creating gods to building products,98,randomwalker,5 hours ago,114,aisnakeoil.com\r\nBoeing Halts 777X Flight Tests over Damage Found in Engine Mount,17,toomuchtodo,2 hours ago,9,bloomberg.com\r\nThe guidance system and computer of the Minuteman III nuclear missile,132,magnat,7 hours ago,94,righto.com\r\nLLMs know more than what they say,75,nqnielsen,10 hours ago,11,arjunbansal.substack.com\r\nOptimizing global message transit latency: a journey through TCP configuration,53,amnonbc,12 hours ago,11,ably.com\r\nThe ribosome lowers the entropic penalty of protein folding,7,PaulHoule,1 hour ago,N/A,nature.com\r\nSeven basic rules for causal inference,191,RafelMri,20 hours ago,65,pedermisager.org\r\nParsing protobuf at 2+GB/s: how I learned to love tail calls in C (2021),333,fanf2,18 hours ago,140,reverberate.org\r\n\n```"}, {"role": "user", "content": "extract all teh important data in json"}, {"role": "assistant", "content": "```json\n[\n {\n \"title\": 
\"13ft \\u2013 A site similar to 12ft.io but self-hosted\",\n \"points\": 299,\n \"author\": \"darknavi\",\n \"time_ago\": \"7 hours ago\",\n \"comments\": 141\n },\n {\n \"title\": \"Launch HN: Sorcerer (YC S24) \\u2013 Weather balloons that collect more data\",\n \"points\": 255,\n \"author\": \"tndl\",\n \"time_ago\": \"10 hours ago\",\n \"comments\": 120\n },\n {\n \"title\": \"Let's Write a Reverb\",\n \"points\": 73,\n \"author\": \"notagoodidea\",\n \"time_ago\": \"4 hours ago\",\n \"comments\": 8\n },\n {\n \"title\": \"On the cruelty of really teaching computing science (1988)\",\n \"points\": 53,\n \"author\": \"torstenvl\",\n \"time_ago\": \"3 hours ago\",\n \"comments\": 30\n },\n {\n \"title\": \"'Rare species' not seen in the area for 50 years spotted on Arizona trail camera\",\n \"points\": 62,\n \"author\": \"wglb\",\n \"time_ago\": \"6 hours ago\",\n \"comments\": 8\n },\n {\n \"title\": \"Lenticular Clock\",\n \"points\": 84,\n \"author\": \"animal_spirits\",\n \"time_ago\": \"7 hours ago\",\n \"comments\": 17\n },\n {\n \"title\": \"Music recommendation system using transformer models\",\n \"points\": 68,\n \"author\": \"panarky\",\n \"time_ago\": \"5 hours ago\",\n \"comments\": 32\n },\n {\n \"title\": \"Classifying all of the pdfs on the internet\",\n \"points\": 258,\n \"author\": \"Nydhal\",\n \"time_ago\": \"14 hours ago\",\n \"comments\": 91\n },\n {\n \"title\": \"Infisical (YC W23) Is Hiring Full Stack Engineer (Remote)\",\n \"points\": \"N/A\",\n \"author\": \"N/A\",\n \"time_ago\": \"2 hours ago\",\n \"comments\": \"N/A\"\n },\n {\n \"title\": \"Mass Market DVDs Are Dead: Long Live Heritage Physical Media\",\n \"points\": 13,\n \"author\": \"throw0101d\",\n \"time_ago\": \"3 hours ago\",\n \"comments\": 5\n },\n {\n \"title\": \"Migrating Mess with DNS to Use PowerDNS\",\n \"points\": 92,\n \"author\": \"hasheddan\",\n \"time_ago\": \"9 hours ago\",\n \"comments\": 21\n },\n {\n \"title\": \"The gigantic and unregulated power plants in the cloud\",\n \"points\": 331,\n \"author\": \"ahubert\",\n \"time_ago\": \"11 hours ago\",\n \"comments\": 163\n },\n {\n \"title\": \"Ask HN: How do you work as a tech lead?\",\n \"points\": 53,\n \"author\": \"obvthroaway\",\n \"time_ago\": \"3 hours ago\",\n \"comments\": 32\n },\n {\n \"title\": \"Cardinal \\u2013 Virtual modular synthesizer plugin\",\n \"points\": 68,\n \"author\": \"iscream26\",\n \"time_ago\": \"6 hours ago\",\n \"comments\": 15\n },\n {\n \"title\": \"Spine \\u2013 2D skeletal animation for games, with physical secondary motions\",\n \"points\": 64,\n \"author\": \"diggan\",\n \"time_ago\": \"9 hours ago\",\n \"comments\": 23\n },\n {\n \"title\": \"Netboot.xyz: your favorite operating systems in one place\",\n \"points\": 201,\n \"author\": \"thunderbong\",\n \"time_ago\": \"7 hours ago\",\n \"comments\": 43\n },\n {\n \"title\": \"Bit-banging a BASIC Birthday\",\n \"points\": 111,\n \"author\": \"debo_\",\n \"time_ago\": \"15 hours ago\",\n \"comments\": 15\n },\n {\n \"title\": \"Page Turns: Literary Translations on the American Ballet Theater's Summer Stage\",\n \"points\": 6,\n \"author\": \"bryanrasmussen\",\n \"time_ago\": \"3 hours ago\",\n \"comments\": \"N/A\"\n },\n {\n \"title\": \"The oral history of the Dinosaur Input Device\",\n \"points\": 37,\n \"author\": \"gelstudios\",\n \"time_ago\": \"4 hours ago\",\n \"comments\": 2\n },\n {\n \"title\": \"Imgpls \\u2013 An extension to load Imgur images directly\",\n \"points\": 99,\n \"author\": \"goranmoomin\",\n \"time_ago\": \"8 hours 
ago\",\n \"comments\": 42\n },\n {\n \"title\": \"Procreate's anti-AI pledge attracts praise from digital creatives\",\n \"points\": 26,\n \"author\": \"TheCleric\",\n \"time_ago\": \"1 hour ago\",\n \"comments\": 4\n },\n {\n \"title\": \"Searching a Codebase in English\",\n \"points\": 24,\n \"author\": \"dakshgupta\",\n \"time_ago\": \"5 hours ago\",\n \"comments\": 9\n },\n {\n \"title\": \"AI companies are pivoting from creating gods to building products\",\n \"points\": 98,\n \"author\": \"randomwalker\",\n \"time_ago\": \"5 hours ago\",\n \"comments\": 114\n },\n {\n \"title\": \"Boeing Halts 777X Flight Tests over Damage Found in Engine Mount\",\n \"points\": 17,\n \"author\": \"toomuchtodo\",\n \"time_ago\": \"2 hours ago\",\n \"comments\": 9\n },\n {\n \"title\": \"The guidance system and computer of the Minuteman III nuclear missile\",\n \"points\": 132,\n \"author\": \"magnat\",\n \"time_ago\": \"7 hours ago\",\n \"comments\": 94\n },\n {\n \"title\": \"LLMs know more than what they say\",\n \"points\": 75,\n \"author\": \"nqnielsen\",\n \"time_ago\": \"10 hours ago\",\n \"comments\": 11\n },\n {\n \"title\": \"Optimizing global message transit latency: a journey through TCP configuration\",\n \"points\": 53,\n \"author\": \"amnonbc\",\n \"time_ago\": \"12 hours ago\",\n \"comments\": 11\n },\n {\n \"title\": \"The ribosome lowers the entropic penalty of protein folding\",\n \"points\": 7,\n \"author\": \"PaulHoule\",\n \"time_ago\": \"1 hour ago\",\n \"comments\": \"N/A\"\n },\n {\n \"title\": \"Seven basic rules for causal inference\",\n \"points\": 191,\n \"author\": \"RafelMri\",\n \"time_ago\": \"20 hours ago\",\n \"comments\": 65\n },\n {\n \"title\": \"Parsing protobuf at 2+GB/s: how I learned to love tail calls in C (2021)\",\n \"points\": 333,\n \"author\": \"fanf2\",\n \"time_ago\": \"18 hours ago\",\n \"comments\": 140\n }\n]\n```"}], "date": "2024-08-20"}}
main.py CHANGED
@@ -1,15 +1,28 @@
  import streamlit as st
  import json
+ import asyncio
+ import logging
  from app.streamlit_web_scraper_chat import StreamlitWebScraperChat
  from app.ui_components import display_info_icons, display_message
  from app.utils import loading_animation, get_loading_message
  from datetime import datetime, timedelta
+ from src.ollama_models import OllamaModel
+ import pandas as pd

  def safe_process_message(web_scraper_chat, message):
      if message is None or message.strip() == "":
          return "I'm sorry, but I didn't receive any input. Could you please try again?"
      try:
-         return web_scraper_chat.process_message(message)
+         response = web_scraper_chat.process_message(message)
+         if isinstance(response, tuple) and len(response) == 2 and isinstance(response[1], pd.DataFrame):
+             # This is a CSV response
+             csv_string, df = response
+             st.text("CSV Data:")
+             st.code(csv_string, language="csv")
+             st.text("Interactive Table:")
+             st.dataframe(df)
+             return csv_string  # Return only the string part for chat history
+         return response
      except AttributeError as e:
          if "'NoneType' object has no attribute 'lower'" in str(e):
              return "I encountered an issue while processing your request. It seems like I received an unexpected empty value. Could you please try rephrasing your input?"
@@ -48,12 +61,36 @@ def get_last_url_from_chat(messages):
      return None

  def initialize_web_scraper_chat(url=None):
-     web_scraper_chat = StreamlitWebScraperChat(model_name=st.session_state.selected_model)
+     if st.session_state.selected_model.startswith("ollama:"):
+         model = OllamaModel(st.session_state.selected_model[7:])
+     else:
+         model = st.session_state.selected_model
+     web_scraper_chat = StreamlitWebScraperChat(model_name=model)
+     if url:
+         web_scraper_chat.process_message(url)
+     return web_scraper_chat
+
+ async def list_ollama_models():
+     try:
+         return await OllamaModel.list_models()
+     except Exception as e:
+         st.error(f"Error fetching Ollama models: {str(e)}")
+         return []
+
+ def initialize_web_scraper_chat(url=None):
+     model_name = st.session_state.selected_model
+     web_scraper_chat = StreamlitWebScraperChat(model_name=model_name)
      if url:
          web_scraper_chat.process_message(url)
      return web_scraper_chat

  def main():
+     # Set up logging
+     logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+     logger = logging.getLogger(__name__)
+     logger.debug("Starting CyberScraper 2077")
+
+     # Set page config at the very beginning
      st.set_page_config(page_title="CyberScraper 2077", page_icon="🌐", layout="wide")

      hide_streamlit_style = """
@@ -157,6 +194,25 @@ def main():

      with st.sidebar:
          st.title("Conversation History")
+
+         # Model selection
+         st.subheader("Select Model")
+         default_models = ["gpt-4o-mini", "gpt-3.5-turbo"]
+         ollama_models = st.session_state.get('ollama_models', [])
+         all_models = default_models + [f"ollama:{model}" for model in ollama_models]
+         selected_model = st.selectbox("Choose a model", all_models, index=all_models.index(st.session_state.selected_model) if st.session_state.selected_model in all_models else 0)
+
+         if selected_model != st.session_state.selected_model:
+             st.session_state.selected_model = selected_model
+             st.session_state.web_scraper_chat = None
+             st.rerun()
+
+         if st.button("Refresh Ollama Models"):
+             with st.spinner("Fetching Ollama models..."):
+                 st.session_state.ollama_models = asyncio.run(list_ollama_models())
+             st.success(f"Found {len(st.session_state.ollama_models)} Ollama models")
+             st.rerun()
+
          if st.button("+ 🗨️ New Chat", key="new_chat", use_container_width=True):
              new_chat_id = str(datetime.now().timestamp())
              st.session_state.chat_history[new_chat_id] = {
@@ -237,23 +293,28 @@ def main():
      prompt = st.chat_input("Enter the URL to scrape or ask a question regarding the data", key="user_input")

      if prompt:
+         logger.debug(f"Received prompt: {prompt}")
          st.session_state.chat_history[st.session_state.current_chat_id]["messages"].append({"role": "user", "content": prompt})
          save_chat_history(st.session_state.chat_history)

          if not st.session_state.web_scraper_chat:
+             logger.debug("Initializing web_scraper_chat")
              st.session_state.web_scraper_chat = initialize_web_scraper_chat()

          with st.chat_message("assistant"):
              try:
+                 logger.debug("Processing message with web_scraper_chat")
                  full_response = loading_animation(
                      safe_process_message,
                      st.session_state.web_scraper_chat,
                      prompt
                  )
+                 logger.debug(f"Received response (first 500 chars): {str(full_response)[:500]}...")
                  if full_response is not None:
-                     st.session_state.chat_history[st.session_state.current_chat_id]["messages"].append({"role": "assistant", "content": full_response})
+                     st.session_state.chat_history[st.session_state.current_chat_id]["messages"].append({"role": "assistant", "content": str(full_response)})
                      save_chat_history(st.session_state.chat_history)
              except Exception as e:
+                 logger.error(f"An unexpected error occurred: {str(e)}")
                  st.error(f"An unexpected error occurred: {str(e)}")

          st.rerun()
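The sidebar relies on a simple naming convention to route model selection: Ollama models are listed with an `ollama:` prefix, which `initialize_web_scraper_chat` strips before building an `OllamaModel`. A standalone sketch of that routing, isolated from Streamlit (the helper name `resolve_model` is hypothetical; the prefix convention and `OllamaModel` come from this commit):

```python
from src.ollama_models import OllamaModel

def resolve_model(selected: str):
    """Map a sidebar selection to what StreamlitWebScraperChat expects.

    "ollama:<name>" becomes an OllamaModel instance; any other value
    (e.g. "gpt-4o-mini") is passed through as a plain model-name string.
    """
    if selected.startswith("ollama:"):
        return OllamaModel(selected[len("ollama:"):])
    return selected

# Quick check of both branches.
assert isinstance(resolve_model("ollama:mistral"), OllamaModel)
assert resolve_model("gpt-4o-mini") == "gpt-4o-mini"
```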
src/.DS_Store CHANGED
Binary files a/src/.DS_Store and b/src/.DS_Store differ
 
src/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (155 Bytes).

src/__pycache__/models.cpython-312.pyc ADDED
Binary file (1 kB).

src/__pycache__/ollama_models.cpython-312.pyc ADDED
Binary file (3.18 kB).

src/__pycache__/web_extractor.cpython-312.pyc ADDED
Binary file (22 kB).
 
src/ollama_models.py ADDED
@@ -0,0 +1,36 @@
+ import ollama
+ from typing import List, Dict, Any
+ import logging
+
+ class OllamaModel:
+     def __init__(self, model_name: str):
+         self.model_name = model_name
+         self.logger = logging.getLogger(__name__)
+         self.logger.setLevel(logging.DEBUG)
+
+     async def generate(self, prompt: str, system_prompt: str = "") -> str:
+         self.logger.debug(f"Generating with Ollama model: {self.model_name}")
+         self.logger.debug(f"Prompt (first 500 chars): {prompt[:500]}...")
+         try:
+             response = ollama.generate(model=self.model_name, prompt=prompt, system=system_prompt)
+             self.logger.debug(f"Ollama response (first 500 chars): {response['response'][:500]}...")
+             return response['response']
+         except Exception as e:
+             self.logger.error(f"Error generating with Ollama: {str(e)}")
+             raise
+
+     @staticmethod
+     async def list_models() -> List[str]:
+         logger = logging.getLogger(__name__)
+         try:
+             models = ollama.list()
+             logger.debug(f"Available Ollama models: {models['models']}")
+             return [model['name'] for model in models['models']]
+         except Exception as e:
+             logger.error(f"Error listing Ollama models: {str(e)}")
+             return []
+
+ class OllamaModelManager:
+     @staticmethod
+     def get_model(model_name: str) -> OllamaModel:
+         return OllamaModel(model_name)
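A minimal sketch of exercising the new wrapper outside the Streamlit app, assuming the `ollama` Python client is installed and a local Ollama server with at least one pulled model is running; the prompt text is illustrative:

```python
import asyncio

from src.ollama_models import OllamaModel

async def demo():
    # list_models() wraps ollama.list() and returns model names (or [] on error).
    names = await OllamaModel.list_models()
    print("Local Ollama models:", names)

    if names:
        model = OllamaModel(names[0])
        # generate() wraps ollama.generate() and returns the raw response text.
        reply = await model.generate(
            prompt="In one sentence, what is web scraping?",
            system_prompt="Answer briefly.",
        )
        print(reply)

asyncio.run(demo())
```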
src/scrapers/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (325 Bytes).

src/scrapers/__pycache__/base_scraper.cpython-312.pyc ADDED
Binary file (929 Bytes).

src/scrapers/__pycache__/html_scraper.cpython-312.pyc ADDED
Binary file (1.43 kB).

src/scrapers/__pycache__/json_scraper.cpython-312.pyc ADDED
Binary file (1.22 kB).

src/scrapers/__pycache__/playwright_scraper.cpython-312.pyc ADDED
Binary file (8.91 kB).

src/utils/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (161 Bytes).

src/utils/__pycache__/markdown_formatter.cpython-312.pyc ADDED
Binary file (1.01 kB).

src/utils/__pycache__/proxy_manager.cpython-312.pyc ADDED
Binary file (714 Bytes).
 
src/web_extractor.py CHANGED
@@ -1,11 +1,14 @@
  import asyncio
- from typing import Dict, Any, Optional, List
+ from typing import Dict, Any, Optional, List, Tuple
  import json
  import pandas as pd
- from io import BytesIO
+ from io import BytesIO, StringIO
  import re
  from .models import Models
- from .scrapers import PlaywrightScraper, HTMLScraper, JSONScraper
+ from .ollama_models import OllamaModel, OllamaModelManager
+ from .scrapers.playwright_scraper import PlaywrightScraper
+ from .scrapers.html_scraper import HTMLScraper
+ from .scrapers.json_scraper import JSONScraper
  from .utils.proxy_manager import ProxyManager
  from .utils.markdown_formatter import MarkdownFormatter
  from langchain.prompts import PromptTemplate
@@ -13,11 +16,20 @@ from langchain.schema.runnable import RunnableSequence
  from langchain.text_splitter import RecursiveCharacterTextSplitter
  import tiktoken
  import time
+ import logging
+ import csv
+ from bs4 import BeautifulSoup, Comment

  class WebExtractor:
      def __init__(self, model_name: str = "gpt-4o-mini", model_kwargs: Dict[str, Any] = None, proxy: Optional[str] = None):
          model_kwargs = model_kwargs or {}
-         self.model = Models.get_model(model_name, **model_kwargs)
+         if isinstance(model_name, str) and model_name.startswith("ollama:"):
+             self.model = OllamaModelManager.get_model(model_name[7:])
+         elif isinstance(model_name, OllamaModel):
+             self.model = model_name
+         else:
+             self.model = Models.get_model(model_name, **model_kwargs)
+
          self.playwright_scraper = PlaywrightScraper()
          self.html_scraper = HTMLScraper()
          self.json_scraper = JSONScraper()
@@ -33,6 +45,8 @@ class WebExtractor:
              length_function=self.num_tokens_from_string,
          )
          self.max_tokens = 128000 if model_name == "gpt-4o-mini" else 16385
+         self.logger = logging.getLogger(__name__)
+         self.logger.setLevel(logging.DEBUG)

      @staticmethod
      def num_tokens_from_string(string: str) -> int:
@@ -60,33 +74,72 @@ class WebExtractor:
          return f"I've fetched and preprocessed the content from {self.current_url}. What would you like to know about it?"

      def _preprocess_content(self, content: str) -> str:
-         content = re.sub(r'<script\b[^>]*>[\s\S]*?</script>', '', content)
-         content = re.sub(r'<style\b[^>]*>[\s\S]*?</style>', '', content)
-         content = re.sub(r'<!--[\s\S]*?-->', '', content)
-         content = re.sub(r'<(?!/?(?:table|tr|th|td|thead|tbody|ul|ol|li|p|h[1-6]|br|hr)[>\s])\/?[^>]*>', '', content)
-         content = re.sub(r'\s+', ' ', content)
-         return content.strip()
+         soup = BeautifulSoup(content, 'html.parser')
+
+         for script in soup(["script", "style"]):
+             script.decompose()
+
+         for comment in soup.find_all(text=lambda text: isinstance(text, Comment)):
+             comment.extract()
+
+         for tag in soup(["header", "footer", "nav", "aside"]):
+             tag.decompose()
+
+         for tag in soup.find_all():
+             if len(tag.get_text(strip=True)) == 0:
+                 tag.extract()
+
+         text = soup.get_text()
+
+         lines = (line.strip() for line in text.splitlines())
+         chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
+         text = '\n'.join(chunk for chunk in chunks if chunk)
+
+         return text
+
+     def _merge_json_chunks(self, chunks: List[str]) -> str:
+         merged_data = []
+         for chunk in chunks:
+             try:
+                 data = json.loads(chunk)
+                 if isinstance(data, list):
+                     merged_data.extend(data)
+                 else:
+                     merged_data.append(data)
+             except json.JSONDecodeError:
+                 self.logger.error(f"Failed to parse JSON chunk: {chunk[:100]}...")
+         return json.dumps(merged_data)

      async def _extract_info(self, query: str) -> str:
+         self.logger.debug(f"Extracting info with model: {self.model}")
          content_tokens = self.num_tokens_from_string(self.preprocessed_content)

          extraction_prompt = PromptTemplate(
              input_variables=["webpage_content", "query"],
              template="""You are an AI assistant that helps with web scraping tasks.
              Based on the following preprocessed webpage content and the user's request, extract the relevant information.
-             Present the data in a structured format as specified by the user's query:
-             - If the user asks for JSON, respond with a JSON array of objects.
-             - If the user asks for CSV, respond with CSV data (including headers).
-             - If the user asks for Excel, respond with data in a tabular format suitable for Excel.
-             - If the user asks for SQL, respond with a SQL table format including `CREATE TABLE` and `INSERT INTO` statements.
-             - If the user asks for HTML, respond with an HTML table format.
-             - If no format is specified, present the data as a list of dictionaries.
-
-             Include all requested fields, and if a field is not found, use "N/A" as the value.
+             Always present the data as a JSON array of objects, regardless of the user's requested format.
+             Each object in the array should represent one item or row of data.
+             Use the following format without any unnecessary text, provide only the format and nothing else:
+
+             [
+               {{
+                 "field1": "value1",
+                 "field2": "value2"
+               }},
+               {{
+                 "field1": "value1",
+                 "field2": "value2"
+               }}
+             ]
+
+             If the user asks for information about the data on the webpage, explain about the data in bullet points and how can we use it, and provide further information if asked.
+             Include all requested fields. If a field is not found, use "N/A" as the value.
              Do not invent or fabricate any data. If the information is not present, use "N/A".
              If the user specifies a number of entries to extract, limit your response to that number.
              If the user asks for all extractable data, provide all entries you can find.
              Ensure that the extracted data accurately reflects the content of the webpage.
+             Use appropriate field names based on the webpage content and the user's query.

              Preprocessed webpage content:
              {webpage_content}
@@ -95,65 +148,186 @@ class WebExtractor:
              AI: """
          )

+         self.logger.debug(f"Extraction prompt template: {extraction_prompt.template}")
+         self.logger.debug(f"Query: {query}")
+
          if content_tokens <= self.max_tokens - 1000:
-             chain = RunnableSequence(extraction_prompt | self.model)
-             response = await chain.ainvoke({"webpage_content": self.preprocessed_content, "query": query})
-             extracted_data = response.content
+             if isinstance(self.model, OllamaModel):
+                 self.logger.debug("Using OllamaModel for extraction")
+                 full_prompt = extraction_prompt.format(webpage_content=self.preprocessed_content, query=query)
+                 self.logger.debug(f"Full prompt for Ollama (first 500 chars): {full_prompt[:500]}...")
+                 extracted_data = await self.model.generate(prompt=full_prompt)
+             else:
+                 self.logger.debug("Using non-Ollama model for extraction")
+                 chain = RunnableSequence(extraction_prompt | self.model)
+                 response = await chain.ainvoke({"webpage_content": self.preprocessed_content, "query": query})
+                 extracted_data = response.content
          else:
              chunks = self.optimized_text_splitter(self.preprocessed_content)
+             self.logger.debug(f"Content split into {len(chunks)} chunks")
              all_extracted_data = []
-             for chunk in chunks:
-                 chain = RunnableSequence(extraction_prompt | self.model)
-                 response = await chain.ainvoke({"webpage_content": chunk, "query": query})
-                 all_extracted_data.append(response.content)
-             extracted_data = "\n".join(all_extracted_data)
+             for i, chunk in enumerate(chunks):
+                 if isinstance(self.model, OllamaModel):
+                     self.logger.debug(f"Processing chunk {i+1}/{len(chunks)} with OllamaModel")
+                     full_prompt = extraction_prompt.format(webpage_content=chunk, query=query)
+                     self.logger.debug(f"Full prompt for chunk {i+1} (first 500 chars): {full_prompt[:500]}...")
+                     chunk_data = await self.model.generate(prompt=full_prompt)
+                 else:
+                     self.logger.debug(f"Processing chunk {i+1}/{len(chunks)} with non-Ollama model")
+                     chain = RunnableSequence(extraction_prompt | self.model)
+                     response = await chain.ainvoke({"webpage_content": chunk, "query": query})
+                     chunk_data = response.content
+                 all_extracted_data.append(chunk_data)
+             extracted_data = self._merge_json_chunks(all_extracted_data)
+
+         self.logger.debug(f"Extracted data (first 500 chars): {extracted_data[:500]}...")

          if 'json' in query.lower():
              return self._format_as_json(extracted_data)
          elif 'csv' in query.lower():
-             return self._format_as_csv(extracted_data)
+             csv_string, df = self._format_as_csv(extracted_data)
+             return f"```csv\n{csv_string}\n```", df
          elif 'excel' in query.lower():
              return self._format_as_excel_and_save(extracted_data)
+         elif 'sql' in query.lower():
+             return self._format_as_sql(extracted_data)
+         elif 'html' in query.lower():
+             return self._format_as_html(extracted_data)
          else:
              return self._format_as_text(extracted_data)

      def optimized_text_splitter(self, text: str) -> List[str]:
          return self.text_splitter.split_text(text)

-     def _format_as_json(self, data: str) -> str:
-         return data
+     def _format_as_sql(self, data: str) -> str:
+         json_pattern = r'```json\s*([\s\S]*?)\s*```'
+         match = re.search(json_pattern, data)
+         if match:
+             data = match.group(1)
+         try:
+             parsed_data = json.loads(data)
+             if not parsed_data:
+                 return "No data to convert to SQL."
+
+             fields = ", ".join([f"{k} TEXT" for k in parsed_data[0].keys()])
+             sql = f"CREATE TABLE extracted_data ({fields});\n"
+
+             for row in parsed_data:
+                 values = ", ".join([f"'{v}'" for v in row.values()])
+                 sql += f"INSERT INTO extracted_data VALUES ({values});\n"
+
+             return f"```sql\n{sql}\n```"
+         except json.JSONDecodeError:
+             return f"Error: Invalid JSON data. Raw data: {data[:500]}..."
+
+     def _format_as_html(self, data: str) -> str:
+         json_pattern = r'```json\s*([\s\S]*?)\s*```'
+         match = re.search(json_pattern, data)
+         if match:
+             data = match.group(1)
+         try:
+             parsed_data = json.loads(data)
+             if not parsed_data:
+                 return "No data to convert to HTML."
+
+             # HTML Table Creation
+             html = "<table>\n<tr>\n"
+             html += "".join([f"<th>{k}</th>" for k in parsed_data[0].keys()])
+             html += "</tr>\n"
+
+             for row in parsed_data:
+                 html += "<tr>\n"
+                 html += "".join([f"<td>{v}</td>" for v in row.values()])
+                 html += "</tr>\n"
+
+             html += "</table>"
+
+             return f"```html\n{html}\n```"
+         except json.JSONDecodeError:
+             return f"Error: Invalid JSON data. Raw data: {data[:500]}..."

-     def _format_as_csv(self, data: str) -> str:
-         return data
+     def _format_as_json(self, data: str) -> str:
+         json_pattern = r'```json\s*([\s\S]*?)\s*```'
+         match = re.search(json_pattern, data)
+         if match:
+             data = match.group(1)
+         try:
+             parsed_data = json.loads(data)
+             return f"```json\n{json.dumps(parsed_data, indent=2)}\n```"
+         except json.JSONDecodeError:
+             return f"Error: Invalid JSON data. Raw data: {data[:500]}..."

      def _format_as_excel_and_save(self, data: str) -> str:
+         json_pattern = r'```json\s*([\s\S]*?)\s*```'
+         match = re.search(json_pattern, data)
+         if match:
+             data = match.group(1)
          try:
-             lines = data.strip().split('\n')
-             rows = [line.split('|') for line in lines if line.strip()]
-             df = pd.DataFrame(rows[1:], columns=[col.strip() for col in rows[0]])
+             parsed_data = json.loads(data)
+             if not parsed_data:
+                 return "No data to convert to Excel."
+
+             df = pd.DataFrame(parsed_data)
              output_filename = "output.xlsx"
             with pd.ExcelWriter(output_filename, engine='xlsxwriter') as writer:
                  df.to_excel(writer, index=False)
              return f"Excel data saved to {output_filename}"
+         except json.JSONDecodeError:
+             return f"Error: Invalid JSON data. Raw data: {data[:500]}..."
          except Exception as e:
-             return f"Error: Unable to convert to Excel format. {str(e)}. Raw data: {data[:500]}..."
+             return f"Error: Failed to convert data to Excel. {str(e)}"

      def _format_as_text(self, data: str) -> str:
+         json_pattern = r'```json\s*([\s\S]*?)\s*```'
+         match = re.search(json_pattern, data)
+         if match:
+             data = match.group(1)
          try:
              parsed_data = json.loads(data)
-             return json.dumps(parsed_data, indent=2)
+             return "\n".join([", ".join([f"{k}: {v}" for k, v in item.items()]) for item in parsed_data])
          except json.JSONDecodeError:
              return data

-     async def save_data(self, filename: str) -> str:
-         if not self.current_content:
-             return "No data to save. Please fetch a webpage first."
-         with open(filename, 'w', encoding='utf-8') as f:
-             f.write(self.current_content)
-         return f"Data saved to {filename}"
+     def _format_as_csv(self, data: str) -> Tuple[str, pd.DataFrame]:
+         json_pattern = r'```json\s*([\s\S]*?)\s*```'
+         match = re.search(json_pattern, data)
+         if match:
+             data = match.group(1)
+         else:
+             code_block_pattern = r'```\s*([\s\S]*?)\s*```'
+             match = re.search(code_block_pattern, data)
+             if match:
+                 data = match.group(1)
+
+         try:
+             parsed_data = json.loads(data)
+             if not parsed_data:
+                 return "No data to convert to CSV.", pd.DataFrame()
+
+             output = StringIO()
+             writer = csv.DictWriter(output, fieldnames=parsed_data[0].keys())
+             writer.writeheader()
+             writer.writerows(parsed_data)
+             csv_string = output.getvalue()
+
+             df = pd.DataFrame(parsed_data)
+
+             return csv_string, df
+         except json.JSONDecodeError as e:
+             self.logger.error(f"JSON Decode Error: {str(e)}")
+             error_msg = f"Error: Invalid JSON data. Raw data: {data[:500]}..."
+             return error_msg, pd.DataFrame()
+         except Exception as e:
+             self.logger.error(f"Unexpected error in _format_as_csv: {str(e)}")
+             error_msg = f"Error: Failed to convert data to CSV. {str(e)}"
+             return error_msg, pd.DataFrame()

      def format_to_markdown(self, text: str) -> str:
          return self.markdown_formatter.to_markdown(text)

      def format_from_markdown(self, markdown_text: str) -> str:
-         return self.markdown_formatter.from_markdown(markdown_text)
+         return self.markdown_formatter.from_markdown(markdown_text)
+
+     @staticmethod
+     async def list_ollama_models() -> List[str]:
+         return await OllamaModel.list_models()
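The reworked pipeline always asks the model for a JSON array and converts it locally; a self-contained sketch of the CSV conversion step, using the same `csv.DictWriter` plus `pandas.DataFrame` approach as `_format_as_csv` (the sample data is made up):

```python
import csv
import json
from io import StringIO

import pandas as pd

# Stand-in for model output: a JSON array of objects, one per row.
extracted = json.dumps([
    {"title": "Example story", "points": 42, "comments": 7},
    {"title": "Another story", "points": "N/A", "comments": 3},
])

parsed = json.loads(extracted)

# Build the CSV string from the first object's keys, as _format_as_csv does.
buf = StringIO()
writer = csv.DictWriter(buf, fieldnames=parsed[0].keys())
writer.writeheader()
writer.writerows(parsed)
csv_string = buf.getvalue()

# The same rows back the interactive table shown by safe_process_message.
df = pd.DataFrame(parsed)

print(csv_string)
print(df)
```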