itsOwen committed
Commit ab80480
Parent(s): d2e413a
ollama experimental
Files changed:
- .DS_Store +0 -0
- .gitignore +0 -69
- __init__.py +0 -0
- app/__pycache__/streamlit_web_scraper_chat.cpython-312.pyc +0 -0
- app/__pycache__/ui_components.cpython-312.pyc +0 -0
- app/__pycache__/utils.cpython-312.pyc +0 -0
- app/streamlit_web_scraper_chat.py +2 -1
- chat_history.json +1 -0
- main.py +64 -3
- src/.DS_Store +0 -0
- src/__pycache__/__init__.cpython-312.pyc +0 -0
- src/__pycache__/models.cpython-312.pyc +0 -0
- src/__pycache__/ollama_models.cpython-312.pyc +0 -0
- src/__pycache__/web_extractor.cpython-312.pyc +0 -0
- src/ollama_models.py +36 -0
- src/scrapers/__pycache__/__init__.cpython-312.pyc +0 -0
- src/scrapers/__pycache__/base_scraper.cpython-312.pyc +0 -0
- src/scrapers/__pycache__/html_scraper.cpython-312.pyc +0 -0
- src/scrapers/__pycache__/json_scraper.cpython-312.pyc +0 -0
- src/scrapers/__pycache__/playwright_scraper.cpython-312.pyc +0 -0
- src/utils/__pycache__/__init__.cpython-312.pyc +0 -0
- src/utils/__pycache__/markdown_formatter.cpython-312.pyc +0 -0
- src/utils/__pycache__/proxy_manager.cpython-312.pyc +0 -0
- src/web_extractor.py +218 -44
.DS_Store
ADDED
Binary file (6.15 kB)
.gitignore
DELETED
@@ -1,69 +0,0 @@
# Python cache files
__pycache__/
*.py[cod]
*$py.class

# Virtual environment
venv/

# Streamlit cache
.streamlit/

# PyCharm files
.idea/

# VS Code files
.vscode/

# Jupyter Notebook
.ipynb_checkpoints

# Environment variables
.env

# Operating system files
.DS_Store
Thumbs.db

# Log files
*.log

# Database files
*.db
*.sqlite3

# Chat history (if you don't want to version control it)
chat_history.json

# Compiled Python files
*.pyc

# Package directories
dist/
build/
*.egg-info/

# Backup files
*~
*.bak

# Coverage reports
htmlcov/
.coverage
.coverage.*
coverage.xml

# Pytest cache
.pytest_cache/

# mypy cache
.mypy_cache/

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/
__init__.py
ADDED
File without changes
app/__pycache__/streamlit_web_scraper_chat.cpython-312.pyc
ADDED
Binary file (1.03 kB)

app/__pycache__/ui_components.cpython-312.pyc
ADDED
Binary file (8.39 kB)

app/__pycache__/utils.cpython-312.pyc
ADDED
Binary file (1.86 kB)
app/streamlit_web_scraper_chat.py
CHANGED
@@ -1,8 +1,9 @@
 import asyncio
+import streamlit as st
 from src.web_extractor import WebExtractor

 class StreamlitWebScraperChat:
-    def __init__(self, model_name
+    def __init__(self, model_name):
         self.web_extractor = WebExtractor(model_name=model_name)

     def process_message(self, message: str) -> str:
chat_history.json
ADDED
@@ -0,0 +1 @@
{"1724123016.501483": {"messages": [{"role": "user", "content": "https://news.ycombinator.com/"}, {"role": "assistant", "content": "I've fetched and preprocessed the content from https://news.ycombinator.com/. What would you like to know about it?"}, {"role": "user", "content": "extract all the data on the website in form of csv"}, {"role": "assistant", "content": "```csv\ntitle,points,author,time_ago,comments,url\r\n13ft \u2013 A site similar to 12ft.io but self-hosted,299,darknavi,7 hours ago,141,github.com/wasi-master\r\nLaunch HN: Sorcerer (YC S24) \u2013 Weather balloons that collect more data,255,tndl,10 hours ago,120,N/A\r\nLet's Write a Reverb,73,notagoodidea,4 hours ago,8,signalsmith-audio.co.uk\r\nOn the cruelty of really teaching computing science (1988),53,torstenvl,3 hours ago,30,utexas.edu\r\n'Rare species' not seen in the area for 50 years spotted on Arizona trail camera,62,wglb,6 hours ago,8,phys.org\r\nLenticular Clock,84,animal_spirits,7 hours ago,17,instructables.com\r\nMusic recommendation system using transformer models,68,panarky,5 hours ago,32,research.google\r\nClassifying all of the pdfs on the internet,258,Nydhal,14 hours ago,91,snats.xyz\r\nInfisical (YC W23) Is Hiring Full Stack Engineer (Remote),N/A,N/A,2 hours ago,N/A,ycombinator.com\r\nMass Market DVDs Are Dead: Long Live Heritage Physical Media,13,throw0101d,3 hours ago,5,variety.com\r\nMigrating Mess with DNS to Use PowerDNS,92,hasheddan,9 hours ago,21,jvns.ca\r\nThe gigantic and unregulated power plants in the cloud,331,ahubert,11 hours ago,163,berthub.eu\r\nAsk HN: How do you work as a tech lead?,53,obvthroaway,3 hours ago,32,N/A\r\nCardinal \u2013 Virtual modular synthesizer plugin,68,iscream26,6 hours ago,15,kx.studio\r\n\"Spine \u2013 2D skeletal animation for games, with physical secondary motions\",64,diggan,9 hours ago,23,esotericsoftware.com\r\nNetboot.xyz: your favorite operating systems in one place,201,thunderbong,7 hours ago,43,netboot.xyz\r\nBit-banging a BASIC Birthday,111,debo_,15 hours ago,15,thingswemake.com\r\nPage Turns: Literary Translations on the American Ballet Theater's Summer Stage,6,bryanrasmussen,3 hours ago,N/A,lithub.com\r\nThe oral history of the Dinosaur Input Device,37,gelstudios,4 hours ago,2,vfxblog.com\r\nImgpls \u2013 An extension to load Imgur images directly,99,goranmoomin,8 hours ago,42,andadinosaur.com\r\nProcreate's anti-AI pledge attracts praise from digital creatives,26,TheCleric,1 hour ago,4,theverge.com\r\nSearching a Codebase in English,24,dakshgupta,5 hours ago,9,greptile.com\r\nAI companies are pivoting from creating gods to building products,98,randomwalker,5 hours ago,114,aisnakeoil.com\r\nBoeing Halts 777X Flight Tests over Damage Found in Engine Mount,17,toomuchtodo,2 hours ago,9,bloomberg.com\r\nThe guidance system and computer of the Minuteman III nuclear missile,132,magnat,7 hours ago,94,righto.com\r\nLLMs know more than what they say,75,nqnielsen,10 hours ago,11,arjunbansal.substack.com\r\nOptimizing global message transit latency: a journey through TCP configuration,53,amnonbc,12 hours ago,11,ably.com\r\nThe ribosome lowers the entropic penalty of protein folding,7,PaulHoule,1 hour ago,N/A,nature.com\r\nSeven basic rules for causal inference,191,RafelMri,20 hours ago,65,pedermisager.org\r\nParsing protobuf at 2+GB/s: how I learned to love tail calls in C (2021),333,fanf2,18 hours ago,140,reverberate.org\r\n\n```"}, {"role": "user", "content": "extract all teh important data in json"}, {"role": "assistant", "content": "```json\n[\n {\n \"title\": 
\"13ft \\u2013 A site similar to 12ft.io but self-hosted\",\n \"points\": 299,\n \"author\": \"darknavi\",\n \"time_ago\": \"7 hours ago\",\n \"comments\": 141\n },\n {\n \"title\": \"Launch HN: Sorcerer (YC S24) \\u2013 Weather balloons that collect more data\",\n \"points\": 255,\n \"author\": \"tndl\",\n \"time_ago\": \"10 hours ago\",\n \"comments\": 120\n },\n {\n \"title\": \"Let's Write a Reverb\",\n \"points\": 73,\n \"author\": \"notagoodidea\",\n \"time_ago\": \"4 hours ago\",\n \"comments\": 8\n },\n {\n \"title\": \"On the cruelty of really teaching computing science (1988)\",\n \"points\": 53,\n \"author\": \"torstenvl\",\n \"time_ago\": \"3 hours ago\",\n \"comments\": 30\n },\n {\n \"title\": \"'Rare species' not seen in the area for 50 years spotted on Arizona trail camera\",\n \"points\": 62,\n \"author\": \"wglb\",\n \"time_ago\": \"6 hours ago\",\n \"comments\": 8\n },\n {\n \"title\": \"Lenticular Clock\",\n \"points\": 84,\n \"author\": \"animal_spirits\",\n \"time_ago\": \"7 hours ago\",\n \"comments\": 17\n },\n {\n \"title\": \"Music recommendation system using transformer models\",\n \"points\": 68,\n \"author\": \"panarky\",\n \"time_ago\": \"5 hours ago\",\n \"comments\": 32\n },\n {\n \"title\": \"Classifying all of the pdfs on the internet\",\n \"points\": 258,\n \"author\": \"Nydhal\",\n \"time_ago\": \"14 hours ago\",\n \"comments\": 91\n },\n {\n \"title\": \"Infisical (YC W23) Is Hiring Full Stack Engineer (Remote)\",\n \"points\": \"N/A\",\n \"author\": \"N/A\",\n \"time_ago\": \"2 hours ago\",\n \"comments\": \"N/A\"\n },\n {\n \"title\": \"Mass Market DVDs Are Dead: Long Live Heritage Physical Media\",\n \"points\": 13,\n \"author\": \"throw0101d\",\n \"time_ago\": \"3 hours ago\",\n \"comments\": 5\n },\n {\n \"title\": \"Migrating Mess with DNS to Use PowerDNS\",\n \"points\": 92,\n \"author\": \"hasheddan\",\n \"time_ago\": \"9 hours ago\",\n \"comments\": 21\n },\n {\n \"title\": \"The gigantic and unregulated power plants in the cloud\",\n \"points\": 331,\n \"author\": \"ahubert\",\n \"time_ago\": \"11 hours ago\",\n \"comments\": 163\n },\n {\n \"title\": \"Ask HN: How do you work as a tech lead?\",\n \"points\": 53,\n \"author\": \"obvthroaway\",\n \"time_ago\": \"3 hours ago\",\n \"comments\": 32\n },\n {\n \"title\": \"Cardinal \\u2013 Virtual modular synthesizer plugin\",\n \"points\": 68,\n \"author\": \"iscream26\",\n \"time_ago\": \"6 hours ago\",\n \"comments\": 15\n },\n {\n \"title\": \"Spine \\u2013 2D skeletal animation for games, with physical secondary motions\",\n \"points\": 64,\n \"author\": \"diggan\",\n \"time_ago\": \"9 hours ago\",\n \"comments\": 23\n },\n {\n \"title\": \"Netboot.xyz: your favorite operating systems in one place\",\n \"points\": 201,\n \"author\": \"thunderbong\",\n \"time_ago\": \"7 hours ago\",\n \"comments\": 43\n },\n {\n \"title\": \"Bit-banging a BASIC Birthday\",\n \"points\": 111,\n \"author\": \"debo_\",\n \"time_ago\": \"15 hours ago\",\n \"comments\": 15\n },\n {\n \"title\": \"Page Turns: Literary Translations on the American Ballet Theater's Summer Stage\",\n \"points\": 6,\n \"author\": \"bryanrasmussen\",\n \"time_ago\": \"3 hours ago\",\n \"comments\": \"N/A\"\n },\n {\n \"title\": \"The oral history of the Dinosaur Input Device\",\n \"points\": 37,\n \"author\": \"gelstudios\",\n \"time_ago\": \"4 hours ago\",\n \"comments\": 2\n },\n {\n \"title\": \"Imgpls \\u2013 An extension to load Imgur images directly\",\n \"points\": 99,\n \"author\": \"goranmoomin\",\n \"time_ago\": \"8 hours 
ago\",\n \"comments\": 42\n },\n {\n \"title\": \"Procreate's anti-AI pledge attracts praise from digital creatives\",\n \"points\": 26,\n \"author\": \"TheCleric\",\n \"time_ago\": \"1 hour ago\",\n \"comments\": 4\n },\n {\n \"title\": \"Searching a Codebase in English\",\n \"points\": 24,\n \"author\": \"dakshgupta\",\n \"time_ago\": \"5 hours ago\",\n \"comments\": 9\n },\n {\n \"title\": \"AI companies are pivoting from creating gods to building products\",\n \"points\": 98,\n \"author\": \"randomwalker\",\n \"time_ago\": \"5 hours ago\",\n \"comments\": 114\n },\n {\n \"title\": \"Boeing Halts 777X Flight Tests over Damage Found in Engine Mount\",\n \"points\": 17,\n \"author\": \"toomuchtodo\",\n \"time_ago\": \"2 hours ago\",\n \"comments\": 9\n },\n {\n \"title\": \"The guidance system and computer of the Minuteman III nuclear missile\",\n \"points\": 132,\n \"author\": \"magnat\",\n \"time_ago\": \"7 hours ago\",\n \"comments\": 94\n },\n {\n \"title\": \"LLMs know more than what they say\",\n \"points\": 75,\n \"author\": \"nqnielsen\",\n \"time_ago\": \"10 hours ago\",\n \"comments\": 11\n },\n {\n \"title\": \"Optimizing global message transit latency: a journey through TCP configuration\",\n \"points\": 53,\n \"author\": \"amnonbc\",\n \"time_ago\": \"12 hours ago\",\n \"comments\": 11\n },\n {\n \"title\": \"The ribosome lowers the entropic penalty of protein folding\",\n \"points\": 7,\n \"author\": \"PaulHoule\",\n \"time_ago\": \"1 hour ago\",\n \"comments\": \"N/A\"\n },\n {\n \"title\": \"Seven basic rules for causal inference\",\n \"points\": 191,\n \"author\": \"RafelMri\",\n \"time_ago\": \"20 hours ago\",\n \"comments\": 65\n },\n {\n \"title\": \"Parsing protobuf at 2+GB/s: how I learned to love tail calls in C (2021)\",\n \"points\": 333,\n \"author\": \"fanf2\",\n \"time_ago\": \"18 hours ago\",\n \"comments\": 140\n }\n]\n```"}], "date": "2024-08-20"}}
main.py
CHANGED
@@ -1,15 +1,28 @@
 import streamlit as st
 import json
+import asyncio
+import logging
 from app.streamlit_web_scraper_chat import StreamlitWebScraperChat
 from app.ui_components import display_info_icons, display_message
 from app.utils import loading_animation, get_loading_message
 from datetime import datetime, timedelta
+from src.ollama_models import OllamaModel
+import pandas as pd

 def safe_process_message(web_scraper_chat, message):
     if message is None or message.strip() == "":
         return "I'm sorry, but I didn't receive any input. Could you please try again?"
     try:
+        response = web_scraper_chat.process_message(message)
+        if isinstance(response, tuple) and len(response) == 2 and isinstance(response[1], pd.DataFrame):
+            # This is a CSV response
+            csv_string, df = response
+            st.text("CSV Data:")
+            st.code(csv_string, language="csv")
+            st.text("Interactive Table:")
+            st.dataframe(df)
+            return csv_string  # Return only the string part for chat history
+        return response
     except AttributeError as e:
         if "'NoneType' object has no attribute 'lower'" in str(e):
             return "I encountered an issue while processing your request. It seems like I received an unexpected empty value. Could you please try rephrasing your input?"
@@ -48,12 +61,36 @@ def get_last_url_from_chat(messages):
     return None

 def initialize_web_scraper_chat(url=None):
+    if st.session_state.selected_model.startswith("ollama:"):
+        model = OllamaModel(st.session_state.selected_model[7:])
+    else:
+        model = st.session_state.selected_model
+    web_scraper_chat = StreamlitWebScraperChat(model_name=model)
+    if url:
+        web_scraper_chat.process_message(url)
+    return web_scraper_chat
+
+async def list_ollama_models():
+    try:
+        return await OllamaModel.list_models()
+    except Exception as e:
+        st.error(f"Error fetching Ollama models: {str(e)}")
+        return []
+
+def initialize_web_scraper_chat(url=None):
+    model_name = st.session_state.selected_model
+    web_scraper_chat = StreamlitWebScraperChat(model_name=model_name)
     if url:
         web_scraper_chat.process_message(url)
     return web_scraper_chat

 def main():
+    # Set up logging
+    logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+    logger = logging.getLogger(__name__)
+    logger.debug("Starting CyberScraper 2077")
+
+    # Set page config at the very beginning
     st.set_page_config(page_title="CyberScraper 2077", page_icon="🌐", layout="wide")

     hide_streamlit_style = """
@@ -157,6 +194,25 @@ def main():

     with st.sidebar:
         st.title("Conversation History")
+
+        # Model selection
+        st.subheader("Select Model")
+        default_models = ["gpt-4o-mini", "gpt-3.5-turbo"]
+        ollama_models = st.session_state.get('ollama_models', [])
+        all_models = default_models + [f"ollama:{model}" for model in ollama_models]
+        selected_model = st.selectbox("Choose a model", all_models, index=all_models.index(st.session_state.selected_model) if st.session_state.selected_model in all_models else 0)
+
+        if selected_model != st.session_state.selected_model:
+            st.session_state.selected_model = selected_model
+            st.session_state.web_scraper_chat = None
+            st.rerun()
+
+        if st.button("Refresh Ollama Models"):
+            with st.spinner("Fetching Ollama models..."):
+                st.session_state.ollama_models = asyncio.run(list_ollama_models())
+            st.success(f"Found {len(st.session_state.ollama_models)} Ollama models")
+            st.rerun()
+
         if st.button("+ 🗨️ New Chat", key="new_chat", use_container_width=True):
             new_chat_id = str(datetime.now().timestamp())
             st.session_state.chat_history[new_chat_id] = {
@@ -237,23 +293,28 @@ def main():
     prompt = st.chat_input("Enter the URL to scrape or ask a question regarding the data", key="user_input")

     if prompt:
+        logger.debug(f"Received prompt: {prompt}")
         st.session_state.chat_history[st.session_state.current_chat_id]["messages"].append({"role": "user", "content": prompt})
         save_chat_history(st.session_state.chat_history)

         if not st.session_state.web_scraper_chat:
+            logger.debug("Initializing web_scraper_chat")
             st.session_state.web_scraper_chat = initialize_web_scraper_chat()

         with st.chat_message("assistant"):
             try:
+                logger.debug("Processing message with web_scraper_chat")
                 full_response = loading_animation(
                     safe_process_message,
                     st.session_state.web_scraper_chat,
                     prompt
                 )
+                logger.debug(f"Received response (first 500 chars): {str(full_response)[:500]}...")
                 if full_response is not None:
-                    st.session_state.chat_history[st.session_state.current_chat_id]["messages"].append({"role": "assistant", "content": full_response})
+                    st.session_state.chat_history[st.session_state.current_chat_id]["messages"].append({"role": "assistant", "content": str(full_response)})
                     save_chat_history(st.session_state.chat_history)
             except Exception as e:
+                logger.error(f"An unexpected error occurred: {str(e)}")
                 st.error(f"An unexpected error occurred: {str(e)}")

         st.rerun()
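The model-selection convention introduced in main.py is just a string prefix: Ollama models appear in the sidebar selectbox as "ollama:<name>", and the prefix is stripped (the [7:] slice) before the name reaches the model wrapper. Below is a minimal standalone sketch of that resolution logic, kept outside Streamlit session state; the helper name resolve_model and the model name "llama3" are illustrative, not part of the commit.

from src.ollama_models import OllamaModel

# Illustrative helper (not in the commit): mirrors the branching in the first
# initialize_web_scraper_chat() definition without touching st.session_state.
def resolve_model(selected: str):
    if selected.startswith("ollama:"):
        # "ollama:llama3" -> OllamaModel("llama3"); len("ollama:") == 7
        return OllamaModel(selected[7:])
    # Plain names such as "gpt-4o-mini" pass through unchanged.
    return selected

print(resolve_model("ollama:llama3"))  # an OllamaModel instance
print(resolve_model("gpt-4o-mini"))    # the plain string "gpt-4o-mini"

Note that the second initialize_web_scraper_chat definition in the same hunk shadows the first and passes the raw selected name straight to StreamlitWebScraperChat, so the "ollama:" prefix is ultimately handled by WebExtractor's own check (see the src/web_extractor.py diff below).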
src/.DS_Store
CHANGED
Binary files a/src/.DS_Store and b/src/.DS_Store differ
src/__pycache__/__init__.cpython-312.pyc
ADDED
Binary file (155 Bytes)

src/__pycache__/models.cpython-312.pyc
ADDED
Binary file (1 kB)

src/__pycache__/ollama_models.cpython-312.pyc
ADDED
Binary file (3.18 kB)

src/__pycache__/web_extractor.cpython-312.pyc
ADDED
Binary file (22 kB)
src/ollama_models.py
ADDED
@@ -0,0 +1,36 @@
import ollama
from typing import List, Dict, Any
import logging

class OllamaModel:
    def __init__(self, model_name: str):
        self.model_name = model_name
        self.logger = logging.getLogger(__name__)
        self.logger.setLevel(logging.DEBUG)

    async def generate(self, prompt: str, system_prompt: str = "") -> str:
        self.logger.debug(f"Generating with Ollama model: {self.model_name}")
        self.logger.debug(f"Prompt (first 500 chars): {prompt[:500]}...")
        try:
            response = ollama.generate(model=self.model_name, prompt=prompt, system=system_prompt)
            self.logger.debug(f"Ollama response (first 500 chars): {response['response'][:500]}...")
            return response['response']
        except Exception as e:
            self.logger.error(f"Error generating with Ollama: {str(e)}")
            raise

    @staticmethod
    async def list_models() -> List[str]:
        logger = logging.getLogger(__name__)
        try:
            models = ollama.list()
            logger.debug(f"Available Ollama models: {models['models']}")
            return [model['name'] for model in models['models']]
        except Exception as e:
            logger.error(f"Error listing Ollama models: {str(e)}")
            return []

class OllamaModelManager:
    @staticmethod
    def get_model(model_name: str) -> OllamaModel:
        return OllamaModel(model_name)
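For context, here is a minimal sketch of how this new OllamaModel wrapper could be exercised outside the Streamlit app, assuming the ollama Python package is installed and a local Ollama daemon is running; the model name "llama3" and the prompts are placeholders, not part of the commit.

import asyncio

from src.ollama_models import OllamaModel, OllamaModelManager

async def demo() -> None:
    # Ask the local Ollama daemon which models it has (returns [] on error).
    available = await OllamaModel.list_models()
    print("Available Ollama models:", available)

    # "llama3" is a placeholder; use any name returned by list_models().
    model = OllamaModelManager.get_model("llama3")

    # generate() is declared async but wraps the blocking ollama.generate() call.
    reply = await model.generate(
        prompt="Summarise this page in one sentence.",
        system_prompt="You are a web-scraping assistant.",
    )
    print(reply)

if __name__ == "__main__":
    asyncio.run(demo())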
src/scrapers/__pycache__/__init__.cpython-312.pyc
ADDED
Binary file (325 Bytes)

src/scrapers/__pycache__/base_scraper.cpython-312.pyc
ADDED
Binary file (929 Bytes)

src/scrapers/__pycache__/html_scraper.cpython-312.pyc
ADDED
Binary file (1.43 kB)

src/scrapers/__pycache__/json_scraper.cpython-312.pyc
ADDED
Binary file (1.22 kB)

src/scrapers/__pycache__/playwright_scraper.cpython-312.pyc
ADDED
Binary file (8.91 kB)

src/utils/__pycache__/__init__.cpython-312.pyc
ADDED
Binary file (161 Bytes)

src/utils/__pycache__/markdown_formatter.cpython-312.pyc
ADDED
Binary file (1.01 kB)

src/utils/__pycache__/proxy_manager.cpython-312.pyc
ADDED
Binary file (714 Bytes)
src/web_extractor.py
CHANGED
@@ -1,11 +1,14 @@
 import asyncio
-from typing import Dict, Any, Optional, List
+from typing import Dict, Any, Optional, List, Tuple
 import json
 import pandas as pd
-from io import BytesIO
+from io import BytesIO, StringIO
 import re
 from .models import Models
+from .ollama_models import OllamaModel, OllamaModelManager
+from .scrapers.playwright_scraper import PlaywrightScraper
+from .scrapers.html_scraper import HTMLScraper
+from .scrapers.json_scraper import JSONScraper
 from .utils.proxy_manager import ProxyManager
 from .utils.markdown_formatter import MarkdownFormatter
 from langchain.prompts import PromptTemplate
@@ -13,11 +16,20 @@ from langchain.schema.runnable import RunnableSequence
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 import tiktoken
 import time
+import logging
+import csv
+from bs4 import BeautifulSoup, Comment

 class WebExtractor:
     def __init__(self, model_name: str = "gpt-4o-mini", model_kwargs: Dict[str, Any] = None, proxy: Optional[str] = None):
         model_kwargs = model_kwargs or {}
+        if isinstance(model_name, str) and model_name.startswith("ollama:"):
+            self.model = OllamaModelManager.get_model(model_name[7:])
+        elif isinstance(model_name, OllamaModel):
+            self.model = model_name
+        else:
+            self.model = Models.get_model(model_name, **model_kwargs)
+
         self.playwright_scraper = PlaywrightScraper()
         self.html_scraper = HTMLScraper()
         self.json_scraper = JSONScraper()
@@ -33,6 +45,8 @@ class WebExtractor:
             length_function=self.num_tokens_from_string,
         )
         self.max_tokens = 128000 if model_name == "gpt-4o-mini" else 16385
+        self.logger = logging.getLogger(__name__)
+        self.logger.setLevel(logging.DEBUG)

     @staticmethod
     def num_tokens_from_string(string: str) -> int:
@@ -60,33 +74,72 @@ class WebExtractor:
         return f"I've fetched and preprocessed the content from {self.current_url}. What would you like to know about it?"

     def _preprocess_content(self, content: str) -> str:
+        soup = BeautifulSoup(content, 'html.parser')
+
+        for script in soup(["script", "style"]):
+            script.decompose()
+
+        for comment in soup.find_all(text=lambda text: isinstance(text, Comment)):
+            comment.extract()
+
+        for tag in soup(["header", "footer", "nav", "aside"]):
+            tag.decompose()
+
+        for tag in soup.find_all():
+            if len(tag.get_text(strip=True)) == 0:
+                tag.extract()
+
+        text = soup.get_text()
+
+        lines = (line.strip() for line in text.splitlines())
+        chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
+        text = '\n'.join(chunk for chunk in chunks if chunk)
+
+        return text
+
+    def _merge_json_chunks(self, chunks: List[str]) -> str:
+        merged_data = []
+        for chunk in chunks:
+            try:
+                data = json.loads(chunk)
+                if isinstance(data, list):
+                    merged_data.extend(data)
+                else:
+                    merged_data.append(data)
+            except json.JSONDecodeError:
+                self.logger.error(f"Failed to parse JSON chunk: {chunk[:100]}...")
+        return json.dumps(merged_data)

     async def _extract_info(self, query: str) -> str:
+        self.logger.debug(f"Extracting info with model: {self.model}")
         content_tokens = self.num_tokens_from_string(self.preprocessed_content)

         extraction_prompt = PromptTemplate(
             input_variables=["webpage_content", "query"],
             template="""You are an AI assistant that helps with web scraping tasks.
             Based on the following preprocessed webpage content and the user's request, extract the relevant information.
+            Always present the data as a JSON array of objects, regardless of the user's requested format.
+            Each object in the array should represent one item or row of data.
+            Use the following format without any unnecessary text, provide only the format and nothing else:
+
+            [
+                {{
+                    "field1": "value1",
+                    "field2": "value2"
+                }},
+                {{
+                    "field1": "value1",
+                    "field2": "value2"
+                }}
+            ]
+
+            If the user asks for information about the data on the webpage, explain about the data in bullet points and how can we use it, and provide further information if asked.
+            Include all requested fields. If a field is not found, use "N/A" as the value.
             Do not invent or fabricate any data. If the information is not present, use "N/A".
             If the user specifies a number of entries to extract, limit your response to that number.
             If the user asks for all extractable data, provide all entries you can find.
             Ensure that the extracted data accurately reflects the content of the webpage.
+            Use appropriate field names based on the webpage content and the user's query.

             Preprocessed webpage content:
             {webpage_content}
@@ -95,65 +148,186 @@ class WebExtractor:
             AI: """
         )

+        self.logger.debug(f"Extraction prompt template: {extraction_prompt.template}")
+        self.logger.debug(f"Query: {query}")
+
         if content_tokens <= self.max_tokens - 1000:
+            if isinstance(self.model, OllamaModel):
+                self.logger.debug("Using OllamaModel for extraction")
+                full_prompt = extraction_prompt.format(webpage_content=self.preprocessed_content, query=query)
+                self.logger.debug(f"Full prompt for Ollama (first 500 chars): {full_prompt[:500]}...")
+                extracted_data = await self.model.generate(prompt=full_prompt)
+            else:
+                self.logger.debug("Using non-Ollama model for extraction")
+                chain = RunnableSequence(extraction_prompt | self.model)
+                response = await chain.ainvoke({"webpage_content": self.preprocessed_content, "query": query})
+                extracted_data = response.content
         else:
             chunks = self.optimized_text_splitter(self.preprocessed_content)
+            self.logger.debug(f"Content split into {len(chunks)} chunks")
             all_extracted_data = []
-            for chunk in chunks:
+            for i, chunk in enumerate(chunks):
+                if isinstance(self.model, OllamaModel):
+                    self.logger.debug(f"Processing chunk {i+1}/{len(chunks)} with OllamaModel")
+                    full_prompt = extraction_prompt.format(webpage_content=chunk, query=query)
+                    self.logger.debug(f"Full prompt for chunk {i+1} (first 500 chars): {full_prompt[:500]}...")
+                    chunk_data = await self.model.generate(prompt=full_prompt)
+                else:
+                    self.logger.debug(f"Processing chunk {i+1}/{len(chunks)} with non-Ollama model")
+                    chain = RunnableSequence(extraction_prompt | self.model)
+                    response = await chain.ainvoke({"webpage_content": chunk, "query": query})
+                    chunk_data = response.content
+                all_extracted_data.append(chunk_data)
+            extracted_data = self._merge_json_chunks(all_extracted_data)
+
+        self.logger.debug(f"Extracted data (first 500 chars): {extracted_data[:500]}...")

         if 'json' in query.lower():
             return self._format_as_json(extracted_data)
         elif 'csv' in query.lower():
+            csv_string, df = self._format_as_csv(extracted_data)
+            return f"```csv\n{csv_string}\n```", df
         elif 'excel' in query.lower():
             return self._format_as_excel_and_save(extracted_data)
+        elif 'sql' in query.lower():
+            return self._format_as_sql(extracted_data)
+        elif 'html' in query.lower():
+            return self._format_as_html(extracted_data)
         else:
             return self._format_as_text(extracted_data)

     def optimized_text_splitter(self, text: str) -> List[str]:
         return self.text_splitter.split_text(text)

+    def _format_as_sql(self, data: str) -> str:
+        json_pattern = r'```json\s*([\s\S]*?)\s*```'
+        match = re.search(json_pattern, data)
+        if match:
+            data = match.group(1)
+        try:
+            parsed_data = json.loads(data)
+            if not parsed_data:
+                return "No data to convert to SQL."
+
+            fields = ", ".join([f"{k} TEXT" for k in parsed_data[0].keys()])
+            sql = f"CREATE TABLE extracted_data ({fields});\n"
+
+            for row in parsed_data:
+                values = ", ".join([f"'{v}'" for v in row.values()])
+                sql += f"INSERT INTO extracted_data VALUES ({values});\n"
+
+            return f"```sql\n{sql}\n```"
+        except json.JSONDecodeError:
+            return f"Error: Invalid JSON data. Raw data: {data[:500]}..."
+
+    def _format_as_html(self, data: str) -> str:
+        json_pattern = r'```json\s*([\s\S]*?)\s*```'
+        match = re.search(json_pattern, data)
+        if match:
+            data = match.group(1)
+        try:
+            parsed_data = json.loads(data)
+            if not parsed_data:
+                return "No data to convert to HTML."
+
+            # HTML Table Creation
+            html = "<table>\n<tr>\n"
+            html += "".join([f"<th>{k}</th>" for k in parsed_data[0].keys()])
+            html += "</tr>\n"
+
+            for row in parsed_data:
+                html += "<tr>\n"
+                html += "".join([f"<td>{v}</td>" for v in row.values()])
+                html += "</tr>\n"
+
+            html += "</table>"
+
+            return f"```html\n{html}\n```"
+        except json.JSONDecodeError:
+            return f"Error: Invalid JSON data. Raw data: {data[:500]}..."

+    def _format_as_json(self, data: str) -> str:
+        json_pattern = r'```json\s*([\s\S]*?)\s*```'
+        match = re.search(json_pattern, data)
+        if match:
+            data = match.group(1)
+        try:
+            parsed_data = json.loads(data)
+            return f"```json\n{json.dumps(parsed_data, indent=2)}\n```"
+        except json.JSONDecodeError:
+            return f"Error: Invalid JSON data. Raw data: {data[:500]}..."

     def _format_as_excel_and_save(self, data: str) -> str:
+        json_pattern = r'```json\s*([\s\S]*?)\s*```'
+        match = re.search(json_pattern, data)
+        if match:
+            data = match.group(1)
         try:
+            parsed_data = json.loads(data)
+            if not parsed_data:
+                return "No data to convert to Excel."
+
+            df = pd.DataFrame(parsed_data)
             output_filename = "output.xlsx"
             with pd.ExcelWriter(output_filename, engine='xlsxwriter') as writer:
                 df.to_excel(writer, index=False)
             return f"Excel data saved to {output_filename}"
+        except json.JSONDecodeError:
+            return f"Error: Invalid JSON data. Raw data: {data[:500]}..."
         except Exception as e:
+            return f"Error: Failed to convert data to Excel. {str(e)}"

     def _format_as_text(self, data: str) -> str:
+        json_pattern = r'```json\s*([\s\S]*?)\s*```'
+        match = re.search(json_pattern, data)
+        if match:
+            data = match.group(1)
         try:
             parsed_data = json.loads(data)
+            return "\n".join([", ".join([f"{k}: {v}" for k, v in item.items()]) for item in parsed_data])
         except json.JSONDecodeError:
             return data

+    def _format_as_csv(self, data: str) -> Tuple[str, pd.DataFrame]:
+        json_pattern = r'```json\s*([\s\S]*?)\s*```'
+        match = re.search(json_pattern, data)
+        if match:
+            data = match.group(1)
+        else:
+            code_block_pattern = r'```\s*([\s\S]*?)\s*```'
+            match = re.search(code_block_pattern, data)
+            if match:
+                data = match.group(1)
+
+        try:
+            parsed_data = json.loads(data)
+            if not parsed_data:
+                return "No data to convert to CSV.", pd.DataFrame()
+
+            output = StringIO()
+            writer = csv.DictWriter(output, fieldnames=parsed_data[0].keys())
+            writer.writeheader()
+            writer.writerows(parsed_data)
+            csv_string = output.getvalue()
+
+            df = pd.DataFrame(parsed_data)
+
+            return csv_string, df
+        except json.JSONDecodeError as e:
+            self.logger.error(f"JSON Decode Error: {str(e)}")
+            error_msg = f"Error: Invalid JSON data. Raw data: {data[:500]}..."
+            return error_msg, pd.DataFrame()
+        except Exception as e:
+            self.logger.error(f"Unexpected error in _format_as_csv: {str(e)}")
+            error_msg = f"Error: Failed to convert data to CSV. {str(e)}"
+            return error_msg, pd.DataFrame()

     def format_to_markdown(self, text: str) -> str:
         return self.markdown_formatter.to_markdown(text)

     def format_from_markdown(self, markdown_text: str) -> str:
-        return self.markdown_formatter.from_markdown(markdown_text)
+        return self.markdown_formatter.from_markdown(markdown_text)
+
+    @staticmethod
+    async def list_ollama_models() -> List[str]:
+        return await OllamaModel.list_models()