itsOwen committed
Commit ab80480 · 1 Parent(s): d2e413a

ollama experimental

.DS_Store ADDED
Binary file (6.15 kB).
 
.gitignore DELETED
@@ -1,69 +0,0 @@
- # Python cache files
- __pycache__/
- *.py[cod]
- *$py.class
-
- # Virtual environment
- venv/
-
- # Streamlit cache
- .streamlit/
-
- # PyCharm files
- .idea/
-
- # VS Code files
- .vscode/
-
- # Jupyter Notebook
- .ipynb_checkpoints
-
- # Environment variables
- .env
-
- # Operating system files
- .DS_Store
- Thumbs.db
-
- # Log files
- *.log
-
- # Database files
- *.db
- *.sqlite3
-
- # Chat history (if you don't want to version control it)
- chat_history.json
-
- # Compiled Python files
- *.pyc
-
- # Package directories
- dist/
- build/
- *.egg-info/
-
- # Backup files
- *~
- *.bak
-
- # Coverage reports
- htmlcov/
- .coverage
- .coverage.*
- coverage.xml
-
- # Pytest cache
- .pytest_cache/
-
- # mypy cache
- .mypy_cache/
-
- # Scrapy stuff:
- .scrapy
-
- # Sphinx documentation
- docs/_build/
-
- # PyBuilder
- target/
__init__.py ADDED
File without changes
app/__pycache__/streamlit_web_scraper_chat.cpython-312.pyc ADDED
Binary file (1.03 kB).

app/__pycache__/ui_components.cpython-312.pyc ADDED
Binary file (8.39 kB).

app/__pycache__/utils.cpython-312.pyc ADDED
Binary file (1.86 kB).
 
app/streamlit_web_scraper_chat.py CHANGED
@@ -1,8 +1,9 @@
  import asyncio
+ import streamlit as st
  from src.web_extractor import WebExtractor

  class StreamlitWebScraperChat:
-     def __init__(self, model_name: str = "gpt-4o-mini"):
+     def __init__(self, model_name):
          self.web_extractor = WebExtractor(model_name=model_name)

      def process_message(self, message: str) -> str:
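With the default model removed from `__init__`, callers must now pass the model explicitly. A minimal caller-side sketch (the constructor comes from this diff; the concrete model values are illustrative assumptions):

```python
# Illustrative only: assumes the repository root is on PYTHONPATH.
from app.streamlit_web_scraper_chat import StreamlitWebScraperChat
from src.ollama_models import OllamaModel

# Pass an OpenAI-style model name as a plain string...
chat = StreamlitWebScraperChat(model_name="gpt-4o-mini")

# ...or an OllamaModel instance, as main.py now does for "ollama:" selections.
local_chat = StreamlitWebScraperChat(model_name=OllamaModel("llama3"))

print(chat.process_message("https://example.com"))
```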
chat_history.json ADDED
@@ -0,0 +1 @@
+ {"1724123016.501483": {"messages": [{"role": "user", "content": "https://news.ycombinator.com/"}, {"role": "assistant", "content": "I've fetched and preprocessed the content from https://news.ycombinator.com/. What would you like to know about it?"}, {"role": "user", "content": "extract all the data on the website in form of csv"}, {"role": "assistant", "content": "```csv\ntitle,points,author,time_ago,comments,url\r\n13ft \u2013 A site similar to 12ft.io but self-hosted,299,darknavi,7 hours ago,141,github.com/wasi-master\r\nLaunch HN: Sorcerer (YC S24) \u2013 Weather balloons that collect more data,255,tndl,10 hours ago,120,N/A\r\nLet's Write a Reverb,73,notagoodidea,4 hours ago,8,signalsmith-audio.co.uk\r\nOn the cruelty of really teaching computing science (1988),53,torstenvl,3 hours ago,30,utexas.edu\r\n'Rare species' not seen in the area for 50 years spotted on Arizona trail camera,62,wglb,6 hours ago,8,phys.org\r\nLenticular Clock,84,animal_spirits,7 hours ago,17,instructables.com\r\nMusic recommendation system using transformer models,68,panarky,5 hours ago,32,research.google\r\nClassifying all of the pdfs on the internet,258,Nydhal,14 hours ago,91,snats.xyz\r\nInfisical (YC W23) Is Hiring Full Stack Engineer (Remote),N/A,N/A,2 hours ago,N/A,ycombinator.com\r\nMass Market DVDs Are Dead: Long Live Heritage Physical Media,13,throw0101d,3 hours ago,5,variety.com\r\nMigrating Mess with DNS to Use PowerDNS,92,hasheddan,9 hours ago,21,jvns.ca\r\nThe gigantic and unregulated power plants in the cloud,331,ahubert,11 hours ago,163,berthub.eu\r\nAsk HN: How do you work as a tech lead?,53,obvthroaway,3 hours ago,32,N/A\r\nCardinal \u2013 Virtual modular synthesizer plugin,68,iscream26,6 hours ago,15,kx.studio\r\n\"Spine \u2013 2D skeletal animation for games, with physical secondary motions\",64,diggan,9 hours ago,23,esotericsoftware.com\r\nNetboot.xyz: your favorite operating systems in one place,201,thunderbong,7 hours ago,43,netboot.xyz\r\nBit-banging a BASIC Birthday,111,debo_,15 hours ago,15,thingswemake.com\r\nPage Turns: Literary Translations on the American Ballet Theater's Summer Stage,6,bryanrasmussen,3 hours ago,N/A,lithub.com\r\nThe oral history of the Dinosaur Input Device,37,gelstudios,4 hours ago,2,vfxblog.com\r\nImgpls \u2013 An extension to load Imgur images directly,99,goranmoomin,8 hours ago,42,andadinosaur.com\r\nProcreate's anti-AI pledge attracts praise from digital creatives,26,TheCleric,1 hour ago,4,theverge.com\r\nSearching a Codebase in English,24,dakshgupta,5 hours ago,9,greptile.com\r\nAI companies are pivoting from creating gods to building products,98,randomwalker,5 hours ago,114,aisnakeoil.com\r\nBoeing Halts 777X Flight Tests over Damage Found in Engine Mount,17,toomuchtodo,2 hours ago,9,bloomberg.com\r\nThe guidance system and computer of the Minuteman III nuclear missile,132,magnat,7 hours ago,94,righto.com\r\nLLMs know more than what they say,75,nqnielsen,10 hours ago,11,arjunbansal.substack.com\r\nOptimizing global message transit latency: a journey through TCP configuration,53,amnonbc,12 hours ago,11,ably.com\r\nThe ribosome lowers the entropic penalty of protein folding,7,PaulHoule,1 hour ago,N/A,nature.com\r\nSeven basic rules for causal inference,191,RafelMri,20 hours ago,65,pedermisager.org\r\nParsing protobuf at 2+GB/s: how I learned to love tail calls in C (2021),333,fanf2,18 hours ago,140,reverberate.org\r\n\n```"}, {"role": "user", "content": "extract all teh important data in json"}, {"role": "assistant", "content": "```json\n[\n {\n \"title\": 
\"13ft \\u2013 A site similar to 12ft.io but self-hosted\",\n \"points\": 299,\n \"author\": \"darknavi\",\n \"time_ago\": \"7 hours ago\",\n \"comments\": 141\n },\n {\n \"title\": \"Launch HN: Sorcerer (YC S24) \\u2013 Weather balloons that collect more data\",\n \"points\": 255,\n \"author\": \"tndl\",\n \"time_ago\": \"10 hours ago\",\n \"comments\": 120\n },\n {\n \"title\": \"Let's Write a Reverb\",\n \"points\": 73,\n \"author\": \"notagoodidea\",\n \"time_ago\": \"4 hours ago\",\n \"comments\": 8\n },\n {\n \"title\": \"On the cruelty of really teaching computing science (1988)\",\n \"points\": 53,\n \"author\": \"torstenvl\",\n \"time_ago\": \"3 hours ago\",\n \"comments\": 30\n },\n {\n \"title\": \"'Rare species' not seen in the area for 50 years spotted on Arizona trail camera\",\n \"points\": 62,\n \"author\": \"wglb\",\n \"time_ago\": \"6 hours ago\",\n \"comments\": 8\n },\n {\n \"title\": \"Lenticular Clock\",\n \"points\": 84,\n \"author\": \"animal_spirits\",\n \"time_ago\": \"7 hours ago\",\n \"comments\": 17\n },\n {\n \"title\": \"Music recommendation system using transformer models\",\n \"points\": 68,\n \"author\": \"panarky\",\n \"time_ago\": \"5 hours ago\",\n \"comments\": 32\n },\n {\n \"title\": \"Classifying all of the pdfs on the internet\",\n \"points\": 258,\n \"author\": \"Nydhal\",\n \"time_ago\": \"14 hours ago\",\n \"comments\": 91\n },\n {\n \"title\": \"Infisical (YC W23) Is Hiring Full Stack Engineer (Remote)\",\n \"points\": \"N/A\",\n \"author\": \"N/A\",\n \"time_ago\": \"2 hours ago\",\n \"comments\": \"N/A\"\n },\n {\n \"title\": \"Mass Market DVDs Are Dead: Long Live Heritage Physical Media\",\n \"points\": 13,\n \"author\": \"throw0101d\",\n \"time_ago\": \"3 hours ago\",\n \"comments\": 5\n },\n {\n \"title\": \"Migrating Mess with DNS to Use PowerDNS\",\n \"points\": 92,\n \"author\": \"hasheddan\",\n \"time_ago\": \"9 hours ago\",\n \"comments\": 21\n },\n {\n \"title\": \"The gigantic and unregulated power plants in the cloud\",\n \"points\": 331,\n \"author\": \"ahubert\",\n \"time_ago\": \"11 hours ago\",\n \"comments\": 163\n },\n {\n \"title\": \"Ask HN: How do you work as a tech lead?\",\n \"points\": 53,\n \"author\": \"obvthroaway\",\n \"time_ago\": \"3 hours ago\",\n \"comments\": 32\n },\n {\n \"title\": \"Cardinal \\u2013 Virtual modular synthesizer plugin\",\n \"points\": 68,\n \"author\": \"iscream26\",\n \"time_ago\": \"6 hours ago\",\n \"comments\": 15\n },\n {\n \"title\": \"Spine \\u2013 2D skeletal animation for games, with physical secondary motions\",\n \"points\": 64,\n \"author\": \"diggan\",\n \"time_ago\": \"9 hours ago\",\n \"comments\": 23\n },\n {\n \"title\": \"Netboot.xyz: your favorite operating systems in one place\",\n \"points\": 201,\n \"author\": \"thunderbong\",\n \"time_ago\": \"7 hours ago\",\n \"comments\": 43\n },\n {\n \"title\": \"Bit-banging a BASIC Birthday\",\n \"points\": 111,\n \"author\": \"debo_\",\n \"time_ago\": \"15 hours ago\",\n \"comments\": 15\n },\n {\n \"title\": \"Page Turns: Literary Translations on the American Ballet Theater's Summer Stage\",\n \"points\": 6,\n \"author\": \"bryanrasmussen\",\n \"time_ago\": \"3 hours ago\",\n \"comments\": \"N/A\"\n },\n {\n \"title\": \"The oral history of the Dinosaur Input Device\",\n \"points\": 37,\n \"author\": \"gelstudios\",\n \"time_ago\": \"4 hours ago\",\n \"comments\": 2\n },\n {\n \"title\": \"Imgpls \\u2013 An extension to load Imgur images directly\",\n \"points\": 99,\n \"author\": \"goranmoomin\",\n \"time_ago\": \"8 hours 
ago\",\n \"comments\": 42\n },\n {\n \"title\": \"Procreate's anti-AI pledge attracts praise from digital creatives\",\n \"points\": 26,\n \"author\": \"TheCleric\",\n \"time_ago\": \"1 hour ago\",\n \"comments\": 4\n },\n {\n \"title\": \"Searching a Codebase in English\",\n \"points\": 24,\n \"author\": \"dakshgupta\",\n \"time_ago\": \"5 hours ago\",\n \"comments\": 9\n },\n {\n \"title\": \"AI companies are pivoting from creating gods to building products\",\n \"points\": 98,\n \"author\": \"randomwalker\",\n \"time_ago\": \"5 hours ago\",\n \"comments\": 114\n },\n {\n \"title\": \"Boeing Halts 777X Flight Tests over Damage Found in Engine Mount\",\n \"points\": 17,\n \"author\": \"toomuchtodo\",\n \"time_ago\": \"2 hours ago\",\n \"comments\": 9\n },\n {\n \"title\": \"The guidance system and computer of the Minuteman III nuclear missile\",\n \"points\": 132,\n \"author\": \"magnat\",\n \"time_ago\": \"7 hours ago\",\n \"comments\": 94\n },\n {\n \"title\": \"LLMs know more than what they say\",\n \"points\": 75,\n \"author\": \"nqnielsen\",\n \"time_ago\": \"10 hours ago\",\n \"comments\": 11\n },\n {\n \"title\": \"Optimizing global message transit latency: a journey through TCP configuration\",\n \"points\": 53,\n \"author\": \"amnonbc\",\n \"time_ago\": \"12 hours ago\",\n \"comments\": 11\n },\n {\n \"title\": \"The ribosome lowers the entropic penalty of protein folding\",\n \"points\": 7,\n \"author\": \"PaulHoule\",\n \"time_ago\": \"1 hour ago\",\n \"comments\": \"N/A\"\n },\n {\n \"title\": \"Seven basic rules for causal inference\",\n \"points\": 191,\n \"author\": \"RafelMri\",\n \"time_ago\": \"20 hours ago\",\n \"comments\": 65\n },\n {\n \"title\": \"Parsing protobuf at 2+GB/s: how I learned to love tail calls in C (2021)\",\n \"points\": 333,\n \"author\": \"fanf2\",\n \"time_ago\": \"18 hours ago\",\n \"comments\": 140\n }\n]\n```"}], "date": "2024-08-20"}}
main.py CHANGED
@@ -1,15 +1,28 @@
  import streamlit as st
  import json
+ import asyncio
+ import logging
  from app.streamlit_web_scraper_chat import StreamlitWebScraperChat
  from app.ui_components import display_info_icons, display_message
  from app.utils import loading_animation, get_loading_message
  from datetime import datetime, timedelta
+ from src.ollama_models import OllamaModel
+ import pandas as pd

  def safe_process_message(web_scraper_chat, message):
      if message is None or message.strip() == "":
          return "I'm sorry, but I didn't receive any input. Could you please try again?"
      try:
-         return web_scraper_chat.process_message(message)
+         response = web_scraper_chat.process_message(message)
+         if isinstance(response, tuple) and len(response) == 2 and isinstance(response[1], pd.DataFrame):
+             # This is a CSV response
+             csv_string, df = response
+             st.text("CSV Data:")
+             st.code(csv_string, language="csv")
+             st.text("Interactive Table:")
+             st.dataframe(df)
+             return csv_string  # Return only the string part for chat history
+         return response
      except AttributeError as e:
          if "'NoneType' object has no attribute 'lower'" in str(e):
              return "I encountered an issue while processing your request. It seems like I received an unexpected empty value. Could you please try rephrasing your input?"
@@ -48,12 +61,36 @@ def get_last_url_from_chat(messages):
      return None

  def initialize_web_scraper_chat(url=None):
-     web_scraper_chat = StreamlitWebScraperChat(model_name=st.session_state.selected_model)
+     if st.session_state.selected_model.startswith("ollama:"):
+         model = OllamaModel(st.session_state.selected_model[7:])
+     else:
+         model = st.session_state.selected_model
+     web_scraper_chat = StreamlitWebScraperChat(model_name=model)
+     if url:
+         web_scraper_chat.process_message(url)
+     return web_scraper_chat
+
+ async def list_ollama_models():
+     try:
+         return await OllamaModel.list_models()
+     except Exception as e:
+         st.error(f"Error fetching Ollama models: {str(e)}")
+         return []
+
+ def initialize_web_scraper_chat(url=None):
+     model_name = st.session_state.selected_model
+     web_scraper_chat = StreamlitWebScraperChat(model_name=model_name)
      if url:
          web_scraper_chat.process_message(url)
      return web_scraper_chat

  def main():
+     # Set up logging
+     logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+     logger = logging.getLogger(__name__)
+     logger.debug("Starting CyberScraper 2077")
+
+     # Set page config at the very beginning
      st.set_page_config(page_title="CyberScraper 2077", page_icon="🌐", layout="wide")

      hide_streamlit_style = """
@@ -157,6 +194,25 @@ def main():

      with st.sidebar:
          st.title("Conversation History")
+
+         # Model selection
+         st.subheader("Select Model")
+         default_models = ["gpt-4o-mini", "gpt-3.5-turbo"]
+         ollama_models = st.session_state.get('ollama_models', [])
+         all_models = default_models + [f"ollama:{model}" for model in ollama_models]
+         selected_model = st.selectbox("Choose a model", all_models, index=all_models.index(st.session_state.selected_model) if st.session_state.selected_model in all_models else 0)
+
+         if selected_model != st.session_state.selected_model:
+             st.session_state.selected_model = selected_model
+             st.session_state.web_scraper_chat = None
+             st.rerun()
+
+         if st.button("Refresh Ollama Models"):
+             with st.spinner("Fetching Ollama models..."):
+                 st.session_state.ollama_models = asyncio.run(list_ollama_models())
+             st.success(f"Found {len(st.session_state.ollama_models)} Ollama models")
+             st.rerun()
+
          if st.button("+ 🗨️ New Chat", key="new_chat", use_container_width=True):
              new_chat_id = str(datetime.now().timestamp())
              st.session_state.chat_history[new_chat_id] = {
@@ -237,23 +293,28 @@ def main():
      prompt = st.chat_input("Enter the URL to scrape or ask a question regarding the data", key="user_input")

      if prompt:
+         logger.debug(f"Received prompt: {prompt}")
          st.session_state.chat_history[st.session_state.current_chat_id]["messages"].append({"role": "user", "content": prompt})
          save_chat_history(st.session_state.chat_history)

          if not st.session_state.web_scraper_chat:
+             logger.debug("Initializing web_scraper_chat")
              st.session_state.web_scraper_chat = initialize_web_scraper_chat()

          with st.chat_message("assistant"):
              try:
+                 logger.debug("Processing message with web_scraper_chat")
                  full_response = loading_animation(
                      safe_process_message,
                      st.session_state.web_scraper_chat,
                      prompt
                  )
+                 logger.debug(f"Received response (first 500 chars): {str(full_response)[:500]}...")
                  if full_response is not None:
-                     st.session_state.chat_history[st.session_state.current_chat_id]["messages"].append({"role": "assistant", "content": full_response})
+                     st.session_state.chat_history[st.session_state.current_chat_id]["messages"].append({"role": "assistant", "content": str(full_response)})
                      save_chat_history(st.session_state.chat_history)
              except Exception as e:
+                 logger.error(f"An unexpected error occurred: {str(e)}")
                  st.error(f"An unexpected error occurred: {str(e)}")

          st.rerun()
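The sidebar relies on a simple naming convention to route model selection: Ollama models are listed with an `ollama:` prefix, which `initialize_web_scraper_chat` strips before building an `OllamaModel`. A standalone sketch of that routing, isolated from Streamlit (the helper name `resolve_model` is hypothetical; the prefix convention and `OllamaModel` come from this commit):

```python
from src.ollama_models import OllamaModel

def resolve_model(selected: str):
    """Map a sidebar selection to what StreamlitWebScraperChat expects.

    "ollama:<name>" becomes an OllamaModel instance; any other value
    (e.g. "gpt-4o-mini") is passed through as a plain model-name string.
    """
    if selected.startswith("ollama:"):
        return OllamaModel(selected[len("ollama:"):])
    return selected

# Quick check of both branches.
assert isinstance(resolve_model("ollama:mistral"), OllamaModel)
assert resolve_model("gpt-4o-mini") == "gpt-4o-mini"
```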
src/.DS_Store CHANGED
Binary files a/src/.DS_Store and b/src/.DS_Store differ
 
src/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (155 Bytes).

src/__pycache__/models.cpython-312.pyc ADDED
Binary file (1 kB).

src/__pycache__/ollama_models.cpython-312.pyc ADDED
Binary file (3.18 kB).

src/__pycache__/web_extractor.cpython-312.pyc ADDED
Binary file (22 kB).
 
src/ollama_models.py ADDED
@@ -0,0 +1,36 @@
+ import ollama
+ from typing import List, Dict, Any
+ import logging
+
+ class OllamaModel:
+     def __init__(self, model_name: str):
+         self.model_name = model_name
+         self.logger = logging.getLogger(__name__)
+         self.logger.setLevel(logging.DEBUG)
+
+     async def generate(self, prompt: str, system_prompt: str = "") -> str:
+         self.logger.debug(f"Generating with Ollama model: {self.model_name}")
+         self.logger.debug(f"Prompt (first 500 chars): {prompt[:500]}...")
+         try:
+             response = ollama.generate(model=self.model_name, prompt=prompt, system=system_prompt)
+             self.logger.debug(f"Ollama response (first 500 chars): {response['response'][:500]}...")
+             return response['response']
+         except Exception as e:
+             self.logger.error(f"Error generating with Ollama: {str(e)}")
+             raise
+
+     @staticmethod
+     async def list_models() -> List[str]:
+         logger = logging.getLogger(__name__)
+         try:
+             models = ollama.list()
+             logger.debug(f"Available Ollama models: {models['models']}")
+             return [model['name'] for model in models['models']]
+         except Exception as e:
+             logger.error(f"Error listing Ollama models: {str(e)}")
+             return []
+
+ class OllamaModelManager:
+     @staticmethod
+     def get_model(model_name: str) -> OllamaModel:
+         return OllamaModel(model_name)
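A minimal sketch of exercising the new wrapper outside the Streamlit app, assuming the `ollama` Python client is installed and a local Ollama server with at least one pulled model is running; the prompt text is illustrative:

```python
import asyncio

from src.ollama_models import OllamaModel

async def demo():
    # list_models() wraps ollama.list() and returns model names (or [] on error).
    names = await OllamaModel.list_models()
    print("Local Ollama models:", names)

    if names:
        model = OllamaModel(names[0])
        # generate() wraps ollama.generate() and returns the raw response text.
        reply = await model.generate(
            prompt="In one sentence, what is web scraping?",
            system_prompt="Answer briefly.",
        )
        print(reply)

asyncio.run(demo())
```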
src/scrapers/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (325 Bytes).

src/scrapers/__pycache__/base_scraper.cpython-312.pyc ADDED
Binary file (929 Bytes).

src/scrapers/__pycache__/html_scraper.cpython-312.pyc ADDED
Binary file (1.43 kB).

src/scrapers/__pycache__/json_scraper.cpython-312.pyc ADDED
Binary file (1.22 kB).

src/scrapers/__pycache__/playwright_scraper.cpython-312.pyc ADDED
Binary file (8.91 kB).

src/utils/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (161 Bytes).

src/utils/__pycache__/markdown_formatter.cpython-312.pyc ADDED
Binary file (1.01 kB).

src/utils/__pycache__/proxy_manager.cpython-312.pyc ADDED
Binary file (714 Bytes).
 
src/web_extractor.py CHANGED
@@ -1,11 +1,14 @@
  import asyncio
- from typing import Dict, Any, Optional, List
+ from typing import Dict, Any, Optional, List, Tuple
  import json
  import pandas as pd
- from io import BytesIO
+ from io import BytesIO, StringIO
  import re
  from .models import Models
- from .scrapers import PlaywrightScraper, HTMLScraper, JSONScraper
+ from .ollama_models import OllamaModel, OllamaModelManager
+ from .scrapers.playwright_scraper import PlaywrightScraper
+ from .scrapers.html_scraper import HTMLScraper
+ from .scrapers.json_scraper import JSONScraper
  from .utils.proxy_manager import ProxyManager
  from .utils.markdown_formatter import MarkdownFormatter
  from langchain.prompts import PromptTemplate
@@ -13,11 +16,20 @@ from langchain.schema.runnable import RunnableSequence
  from langchain.text_splitter import RecursiveCharacterTextSplitter
  import tiktoken
  import time
+ import logging
+ import csv
+ from bs4 import BeautifulSoup, Comment

  class WebExtractor:
      def __init__(self, model_name: str = "gpt-4o-mini", model_kwargs: Dict[str, Any] = None, proxy: Optional[str] = None):
          model_kwargs = model_kwargs or {}
-         self.model = Models.get_model(model_name, **model_kwargs)
+         if isinstance(model_name, str) and model_name.startswith("ollama:"):
+             self.model = OllamaModelManager.get_model(model_name[7:])
+         elif isinstance(model_name, OllamaModel):
+             self.model = model_name
+         else:
+             self.model = Models.get_model(model_name, **model_kwargs)
+
          self.playwright_scraper = PlaywrightScraper()
          self.html_scraper = HTMLScraper()
          self.json_scraper = JSONScraper()
@@ -33,6 +45,8 @@ class WebExtractor:
              length_function=self.num_tokens_from_string,
          )
          self.max_tokens = 128000 if model_name == "gpt-4o-mini" else 16385
+         self.logger = logging.getLogger(__name__)
+         self.logger.setLevel(logging.DEBUG)

      @staticmethod
      def num_tokens_from_string(string: str) -> int:
@@ -60,33 +74,72 @@ class WebExtractor:
          return f"I've fetched and preprocessed the content from {self.current_url}. What would you like to know about it?"

      def _preprocess_content(self, content: str) -> str:
-         content = re.sub(r'<script\b[^>]*>[\s\S]*?</script>', '', content)
-         content = re.sub(r'<style\b[^>]*>[\s\S]*?</style>', '', content)
-         content = re.sub(r'<!--[\s\S]*?-->', '', content)
-         content = re.sub(r'<(?!/?(?:table|tr|th|td|thead|tbody|ul|ol|li|p|h[1-6]|br|hr)[>\s])\/?[^>]*>', '', content)
-         content = re.sub(r'\s+', ' ', content)
-         return content.strip()
+         soup = BeautifulSoup(content, 'html.parser')
+
+         for script in soup(["script", "style"]):
+             script.decompose()
+
+         for comment in soup.find_all(text=lambda text: isinstance(text, Comment)):
+             comment.extract()
+
+         for tag in soup(["header", "footer", "nav", "aside"]):
+             tag.decompose()
+
+         for tag in soup.find_all():
+             if len(tag.get_text(strip=True)) == 0:
+                 tag.extract()
+
+         text = soup.get_text()
+
+         lines = (line.strip() for line in text.splitlines())
+         chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
+         text = '\n'.join(chunk for chunk in chunks if chunk)
+
+         return text
+
+     def _merge_json_chunks(self, chunks: List[str]) -> str:
+         merged_data = []
+         for chunk in chunks:
+             try:
+                 data = json.loads(chunk)
+                 if isinstance(data, list):
+                     merged_data.extend(data)
+                 else:
+                     merged_data.append(data)
+             except json.JSONDecodeError:
+                 self.logger.error(f"Failed to parse JSON chunk: {chunk[:100]}...")
+         return json.dumps(merged_data)

      async def _extract_info(self, query: str) -> str:
+         self.logger.debug(f"Extracting info with model: {self.model}")
          content_tokens = self.num_tokens_from_string(self.preprocessed_content)

          extraction_prompt = PromptTemplate(
              input_variables=["webpage_content", "query"],
              template="""You are an AI assistant that helps with web scraping tasks.
              Based on the following preprocessed webpage content and the user's request, extract the relevant information.
-             Present the data in a structured format as specified by the user's query:
-             - If the user asks for JSON, respond with a JSON array of objects.
-             - If the user asks for CSV, respond with CSV data (including headers).
-             - If the user asks for Excel, respond with data in a tabular format suitable for Excel.
-             - If the user asks for SQL, respond with a SQL table format including `CREATE TABLE` and `INSERT INTO` statements.
-             - If the user asks for HTML, respond with an HTML table format.
-             - If no format is specified, present the data as a list of dictionaries.
-
-             Include all requested fields, and if a field is not found, use "N/A" as the value.
+             Always present the data as a JSON array of objects, regardless of the user's requested format.
+             Each object in the array should represent one item or row of data.
+             Use the following format without any unnecessary text, provide only the format and nothing else:
+
+             [
+               {{
+                 "field1": "value1",
+                 "field2": "value2"
+               }},
+               {{
+                 "field1": "value1",
+                 "field2": "value2"
+               }}
+             ]
+
+             If the user asks for information about the data on the webpage, explain about the data in bullet points and how can we use it, and provide further information if asked.
+             Include all requested fields. If a field is not found, use "N/A" as the value.
              Do not invent or fabricate any data. If the information is not present, use "N/A".
              If the user specifies a number of entries to extract, limit your response to that number.
              If the user asks for all extractable data, provide all entries you can find.
              Ensure that the extracted data accurately reflects the content of the webpage.
+             Use appropriate field names based on the webpage content and the user's query.

              Preprocessed webpage content:
              {webpage_content}
@@ -95,65 +148,186 @@ class WebExtractor:
              AI: """
          )

+         self.logger.debug(f"Extraction prompt template: {extraction_prompt.template}")
+         self.logger.debug(f"Query: {query}")
+
          if content_tokens <= self.max_tokens - 1000:
-             chain = RunnableSequence(extraction_prompt | self.model)
-             response = await chain.ainvoke({"webpage_content": self.preprocessed_content, "query": query})
-             extracted_data = response.content
+             if isinstance(self.model, OllamaModel):
+                 self.logger.debug("Using OllamaModel for extraction")
+                 full_prompt = extraction_prompt.format(webpage_content=self.preprocessed_content, query=query)
+                 self.logger.debug(f"Full prompt for Ollama (first 500 chars): {full_prompt[:500]}...")
+                 extracted_data = await self.model.generate(prompt=full_prompt)
+             else:
+                 self.logger.debug("Using non-Ollama model for extraction")
+                 chain = RunnableSequence(extraction_prompt | self.model)
+                 response = await chain.ainvoke({"webpage_content": self.preprocessed_content, "query": query})
+                 extracted_data = response.content
          else:
              chunks = self.optimized_text_splitter(self.preprocessed_content)
+             self.logger.debug(f"Content split into {len(chunks)} chunks")
              all_extracted_data = []
-             for chunk in chunks:
-                 chain = RunnableSequence(extraction_prompt | self.model)
-                 response = await chain.ainvoke({"webpage_content": chunk, "query": query})
-                 all_extracted_data.append(response.content)
-             extracted_data = "\n".join(all_extracted_data)
+             for i, chunk in enumerate(chunks):
+                 if isinstance(self.model, OllamaModel):
+                     self.logger.debug(f"Processing chunk {i+1}/{len(chunks)} with OllamaModel")
+                     full_prompt = extraction_prompt.format(webpage_content=chunk, query=query)
+                     self.logger.debug(f"Full prompt for chunk {i+1} (first 500 chars): {full_prompt[:500]}...")
+                     chunk_data = await self.model.generate(prompt=full_prompt)
+                 else:
+                     self.logger.debug(f"Processing chunk {i+1}/{len(chunks)} with non-Ollama model")
+                     chain = RunnableSequence(extraction_prompt | self.model)
+                     response = await chain.ainvoke({"webpage_content": chunk, "query": query})
+                     chunk_data = response.content
+                 all_extracted_data.append(chunk_data)
+             extracted_data = self._merge_json_chunks(all_extracted_data)
+
+         self.logger.debug(f"Extracted data (first 500 chars): {extracted_data[:500]}...")

          if 'json' in query.lower():
              return self._format_as_json(extracted_data)
          elif 'csv' in query.lower():
-             return self._format_as_csv(extracted_data)
+             csv_string, df = self._format_as_csv(extracted_data)
+             return f"```csv\n{csv_string}\n```", df
          elif 'excel' in query.lower():
              return self._format_as_excel_and_save(extracted_data)
+         elif 'sql' in query.lower():
+             return self._format_as_sql(extracted_data)
+         elif 'html' in query.lower():
+             return self._format_as_html(extracted_data)
          else:
              return self._format_as_text(extracted_data)

      def optimized_text_splitter(self, text: str) -> List[str]:
          return self.text_splitter.split_text(text)

-     def _format_as_json(self, data: str) -> str:
-         return data
+     def _format_as_sql(self, data: str) -> str:
+         json_pattern = r'```json\s*([\s\S]*?)\s*```'
+         match = re.search(json_pattern, data)
+         if match:
+             data = match.group(1)
+         try:
+             parsed_data = json.loads(data)
+             if not parsed_data:
+                 return "No data to convert to SQL."
+
+             fields = ", ".join([f"{k} TEXT" for k in parsed_data[0].keys()])
+             sql = f"CREATE TABLE extracted_data ({fields});\n"
+
+             for row in parsed_data:
+                 values = ", ".join([f"'{v}'" for v in row.values()])
+                 sql += f"INSERT INTO extracted_data VALUES ({values});\n"
+
+             return f"```sql\n{sql}\n```"
+         except json.JSONDecodeError:
+             return f"Error: Invalid JSON data. Raw data: {data[:500]}..."
+
+     def _format_as_html(self, data: str) -> str:
+         json_pattern = r'```json\s*([\s\S]*?)\s*```'
+         match = re.search(json_pattern, data)
+         if match:
+             data = match.group(1)
+         try:
+             parsed_data = json.loads(data)
+             if not parsed_data:
+                 return "No data to convert to HTML."
+
+             # HTML Table Creation
+             html = "<table>\n<tr>\n"
+             html += "".join([f"<th>{k}</th>" for k in parsed_data[0].keys()])
+             html += "</tr>\n"
+
+             for row in parsed_data:
+                 html += "<tr>\n"
+                 html += "".join([f"<td>{v}</td>" for v in row.values()])
+                 html += "</tr>\n"
+
+             html += "</table>"
+
+             return f"```html\n{html}\n```"
+         except json.JSONDecodeError:
+             return f"Error: Invalid JSON data. Raw data: {data[:500]}..."

-     def _format_as_csv(self, data: str) -> str:
-         return data
+     def _format_as_json(self, data: str) -> str:
+         json_pattern = r'```json\s*([\s\S]*?)\s*```'
+         match = re.search(json_pattern, data)
+         if match:
+             data = match.group(1)
+         try:
+             parsed_data = json.loads(data)
+             return f"```json\n{json.dumps(parsed_data, indent=2)}\n```"
+         except json.JSONDecodeError:
+             return f"Error: Invalid JSON data. Raw data: {data[:500]}..."

      def _format_as_excel_and_save(self, data: str) -> str:
+         json_pattern = r'```json\s*([\s\S]*?)\s*```'
+         match = re.search(json_pattern, data)
+         if match:
+             data = match.group(1)
          try:
-             lines = data.strip().split('\n')
-             rows = [line.split('|') for line in lines if line.strip()]
-             df = pd.DataFrame(rows[1:], columns=[col.strip() for col in rows[0]])
+             parsed_data = json.loads(data)
+             if not parsed_data:
+                 return "No data to convert to Excel."
+
+             df = pd.DataFrame(parsed_data)
              output_filename = "output.xlsx"
             with pd.ExcelWriter(output_filename, engine='xlsxwriter') as writer:
                  df.to_excel(writer, index=False)
              return f"Excel data saved to {output_filename}"
+         except json.JSONDecodeError:
+             return f"Error: Invalid JSON data. Raw data: {data[:500]}..."
          except Exception as e:
-             return f"Error: Unable to convert to Excel format. {str(e)}. Raw data: {data[:500]}..."
+             return f"Error: Failed to convert data to Excel. {str(e)}"

      def _format_as_text(self, data: str) -> str:
+         json_pattern = r'```json\s*([\s\S]*?)\s*```'
+         match = re.search(json_pattern, data)
+         if match:
+             data = match.group(1)
          try:
              parsed_data = json.loads(data)
-             return json.dumps(parsed_data, indent=2)
+             return "\n".join([", ".join([f"{k}: {v}" for k, v in item.items()]) for item in parsed_data])
          except json.JSONDecodeError:
              return data

-     async def save_data(self, filename: str) -> str:
-         if not self.current_content:
-             return "No data to save. Please fetch a webpage first."
-         with open(filename, 'w', encoding='utf-8') as f:
-             f.write(self.current_content)
-         return f"Data saved to {filename}"
+     def _format_as_csv(self, data: str) -> Tuple[str, pd.DataFrame]:
+         json_pattern = r'```json\s*([\s\S]*?)\s*```'
+         match = re.search(json_pattern, data)
+         if match:
+             data = match.group(1)
+         else:
+             code_block_pattern = r'```\s*([\s\S]*?)\s*```'
+             match = re.search(code_block_pattern, data)
+             if match:
+                 data = match.group(1)
+
+         try:
+             parsed_data = json.loads(data)
+             if not parsed_data:
+                 return "No data to convert to CSV.", pd.DataFrame()
+
+             output = StringIO()
+             writer = csv.DictWriter(output, fieldnames=parsed_data[0].keys())
+             writer.writeheader()
+             writer.writerows(parsed_data)
+             csv_string = output.getvalue()
+
+             df = pd.DataFrame(parsed_data)
+
+             return csv_string, df
+         except json.JSONDecodeError as e:
+             self.logger.error(f"JSON Decode Error: {str(e)}")
+             error_msg = f"Error: Invalid JSON data. Raw data: {data[:500]}..."
+             return error_msg, pd.DataFrame()
+         except Exception as e:
+             self.logger.error(f"Unexpected error in _format_as_csv: {str(e)}")
+             error_msg = f"Error: Failed to convert data to CSV. {str(e)}"
+             return error_msg, pd.DataFrame()

      def format_to_markdown(self, text: str) -> str:
          return self.markdown_formatter.to_markdown(text)

      def format_from_markdown(self, markdown_text: str) -> str:
-         return self.markdown_formatter.from_markdown(markdown_text)
+         return self.markdown_formatter.from_markdown(markdown_text)
+
+     @staticmethod
+     async def list_ollama_models() -> List[str]:
+         return await OllamaModel.list_models()
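The reworked pipeline always asks the model for a JSON array and converts it locally; a self-contained sketch of the CSV conversion step, using the same `csv.DictWriter` plus `pandas.DataFrame` approach as `_format_as_csv` (the sample data is made up):

```python
import csv
import json
from io import StringIO

import pandas as pd

# Stand-in for model output: a JSON array of objects, one per row.
extracted = json.dumps([
    {"title": "Example story", "points": 42, "comments": 7},
    {"title": "Another story", "points": "N/A", "comments": 3},
])

parsed = json.loads(extracted)

# Build the CSV string from the first object's keys, as _format_as_csv does.
buf = StringIO()
writer = csv.DictWriter(buf, fieldnames=parsed[0].keys())
writer.writeheader()
writer.writerows(parsed)
csv_string = buf.getvalue()

# The same rows back the interactive table shown by safe_process_message.
df = pd.DataFrame(parsed)

print(csv_string)
print(df)
```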