google-labs-jules[bot] and Greene-ctrl committed
Commit e1d311a · 1 Parent(s): 0526857

Deploy CyberScraper 2077 to Hugging Face with Blablador LLM support

Summary of changes:
- Added FastAPI `api.py` with `/health` and `/api/scrape` endpoints.
- Configured Nginx reverse proxy to handle UI and API on port 7860.
- Implemented Blablador LLM provider with `alias-fast` and `alias-large` models.
- Set `alias-fast` as the default model.
- Updated `Dockerfile` for Hugging Face Spaces (non-root user, `uv` manager).
- Created a GitHub Action for automatic synchronization to Hugging Face Hub.
- Updated the README with Space metadata and added a `.hfignore` for cleaner deployment.
- Verified deployment and functionality on the live Space.
Co-authored-by: Greene-ctrl <192867433+Greene-ctrl@users.noreply.github.com>
- .github/workflows/sync_to_hf.yml +19 -0
- .hfignore +9 -0
- Dockerfile +46 -66
- README.md +9 -0
- api.py +48 -0
- main.py +7 -2
- nginx.conf +51 -0
- src/models.py +7 -0
- src/utils/error_handler.py +12 -0
- start.sh +20 -0
- test_extractor.py +21 -0
- test_patchright.py +13 -0
.github/workflows/sync_to_hf.yml
ADDED
@@ -0,0 +1,19 @@
+name: Sync to Hugging Face hub
+on:
+  push:
+    branches: [main, master, jules-*]
+  workflow_dispatch:
+
+jobs:
+  sync-to-hub:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+        with:
+          fetch-depth: 0
+          lfs: true
+      - name: Push to hub
+        env:
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+        run: |
+          git push --force https://AUXteam:${HF_TOKEN}@huggingface.co/spaces/AUXteam/Scraper_hub HEAD:main
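For the sync to work, the `HF_TOKEN` repository secret must hold a Hugging Face token with write access to the Space. A minimal sketch for sanity-checking a token before storing it as the secret, using the `huggingface_hub` client (the Space id comes from the push URL above; everything else here is illustrative):

```python
# Sanity-check an HF token before saving it as the HF_TOKEN GitHub secret.
# Assumes `pip install huggingface_hub` and the token in the environment.
import os
from huggingface_hub import HfApi

api = HfApi(token=os.environ["HF_TOKEN"])
print(api.whoami()["name"])                      # token is valid and identifies its owner
print(api.space_info("AUXteam/Scraper_hub").id)  # token can reach the target Space
```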
.hfignore
ADDED
@@ -0,0 +1,9 @@
+.git/
+.github/
+venv/
+__pycache__/
+*.pyc
+.env
+chat_history.json
+test_patchright.py
+client_secret.json
Dockerfile
CHANGED
@@ -1,10 +1,16 @@
 # Use Python 3.12 for better performance and compatibility
 FROM python:3.12-slim-bookworm
 
-# Set
-
+# Set environment variables
+ENV PYTHONUNBUFFERED=1 \
+    PYTHONDONTWRITEBYTECODE=1 \
+    PORT=7860 \
+    UV_SYSTEM_PYTHON=1 \
+    HOME=/home/user \
+    STREAMLIT_BROWSER_GATHER_USAGE_STATS=false \
+    STREAMLIT_SERVER_HEADLESS=true
 
-# Install system dependencies
+# Install system dependencies
 RUN apt-get update && apt-get install -y \
     wget \
     gnupg \
@@ -17,6 +23,7 @@ RUN apt-get update && apt-get install -y \
     python3-dev \
     libffi-dev \
     procps \
+    nginx \
     # Browser dependencies for Playwright/Patchright
     libglib2.0-0 \
     libnspr4 \
@@ -38,77 +45,50 @@ RUN apt-get update && apt-get install -y \
     && apt-get clean \
     && rm -rf /var/lib/apt/lists/*
 
-#
+# Install uv
+COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
+
+# Set up working directory
+WORKDIR /app
+
+# Copy requirements and install as root
+COPY requirements.txt .
+RUN uv pip install --system -r requirements.txt
+RUN uv pip install --system fastapi uvicorn
+
+# Install patchright browser
+RUN patchright install chromium
+
+# Create a non-root user
+RUN useradd -m -u 1000 user
+RUN mkdir -p /home/user/.streamlit && chown -R user:user /home/user
+
+# Configure Tor
 RUN echo "SocksPort 9050" >> /etc/tor/torrc && \
     echo "ControlPort 9051" >> /etc/tor/torrc && \
     echo "CookieAuthentication 1" >> /etc/tor/torrc && \
     echo "DataDirectory /var/lib/tor" >> /etc/tor/torrc
 
-# Set
-RUN
-
+# Set permissions for Tor, app directory, and nginx
+RUN mkdir -p /var/lib/tor && \
+    chown -R user:user /var/lib/tor && \
+    chmod 700 /var/lib/tor && \
+    chown -R user:user /app && \
+    mkdir -p /var/log/nginx /var/lib/nginx /tmp && \
+    chown -R user:user /var/log/nginx /var/lib/nginx /tmp
 
-#
-ENV PATH="/app/venv/bin:$PATH"
-
-# Install Python dependencies (includes PySocks for Tor support)
-# Added retries and timeout for network reliability
-RUN pip install --no-cache-dir --timeout=120 --retries=3 -r requirements.txt
-
-# Install patchright browser (chrome not available on ARM64)
-RUN patchright install chromium
+# Copy the rest of the application
+COPY --chown=user:user . .
 
-#
-RUN
-\n\
-# Start Tor service\n\
-echo "Starting Tor service..."\n\
-service tor start\n\
-\n\
-# Wait for Tor to be ready\n\
-echo "Waiting for Tor to start..."\n\
-for i in {1..30}; do\n\
-if ps aux | grep -v grep | grep -q /usr/bin/tor; then\n\
-echo "Tor process is running"\n\
-if nc -z localhost 9050; then\n\
-echo "Tor SOCKS port is listening"\n\
-break\n\
-fi\n\
-fi\n\
-if [ $i -eq 30 ]; then\n\
-echo "Warning: Tor might not be ready, but continuing..."\n\
-fi\n\
-sleep 1\n\
-done\n\
-\n\
-# Verify Tor status\n\
-echo "Checking Tor service status:"\n\
-service tor status\n\
-\n\
-# Export API key if provided\n\
-if [ ! -z "$OPENAI_API_KEY" ]; then\n\
-export OPENAI_API_KEY=$OPENAI_API_KEY\n\
-echo "OpenAI API key configured"\n\
-fi\n\
-\n\
-if [ ! -z "$GOOGLE_API_KEY" ]; then\n\
-export GOOGLE_API_KEY=$GOOGLE_API_KEY\n\
-echo "Google API key configured"\n\
-fi\n\
-\n\
-# Start the application with explicit host binding\n\
-echo "Starting CyberScraper 2077..."\n\
-streamlit run --server.address 0.0.0.0 --server.port 8501 main.py\n\
-' > /app/run.sh
+# Set permissions for the start script
+RUN chmod +x start.sh
 
-# Expose
-EXPOSE
+# Switch to non-root user
+USER user
+ENV PATH="/home/user/.local/bin:$PATH"
+
+# Expose port
+EXPOSE 7860
 
 # Set the entrypoint
-ENTRYPOINT ["
+ENTRYPOINT ["./start.sh"]
README.md
CHANGED
@@ -1,3 +1,12 @@
+---
+title: Scraper Hub
+emoji: 🌐
+colorFrom: blue
+colorTo: red
+sdk: docker
+app_port: 7860
+---
+
 # 🌐 CyberScraper 2077
 
 <p align="center">
api.py
ADDED
@@ -0,0 +1,48 @@
+import os
+import asyncio
+from fastapi import FastAPI, HTTPException
+from pydantic import BaseModel
+from typing import Optional
+from src.web_extractor import WebExtractor
+from src.scrapers.playwright_scraper import ScraperConfig
+
+app = FastAPI()
+
+class ScrapeRequest(BaseModel):
+    url: str
+    query: str
+    model_name: Optional[str] = "alias-fast"
+
+@app.get("/health")
+async def health():
+    return {"status": "ok", "message": "CyberScraper 2077 API is running"}
+
+@app.post("/api/scrape")
+async def scrape(request: ScrapeRequest):
+    scraper_config = ScraperConfig(
+        headless=True,
+        max_retries=3,
+        delay_after_load=5
+    )
+
+    extractor = WebExtractor(model_name=request.model_name, scraper_config=scraper_config)
+    try:
+        # Construct the query by combining URL and the specific request
+        full_query = f"{request.url} {request.query}"
+        response = await extractor.process_query(full_query)
+
+        # If response is a tuple (csv/excel), extract the first part
+        if isinstance(response, tuple):
+            response = response[0]
+
+        return {
+            "url": request.url,
+            "query": request.query,
+            "response": response
+        }
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+
+if __name__ == "__main__":
+    import uvicorn
+    uvicorn.run(app, host="0.0.0.0", port=8000)
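Once `api.py` is running (uvicorn binds to port 8000 inside the container), both endpoints can be exercised directly. A minimal client sketch, assuming local access to port 8000 and the `requests` library; the target URL and query are placeholders:

```python
# Minimal client for the FastAPI endpoints defined above.
# Assumes the API is reachable on localhost:8000 (e.g. from inside the container).
import requests

print(requests.get("http://localhost:8000/health").json())

payload = {
    "url": "https://example.com",
    "query": "Extract the page title",
    "model_name": "alias-fast",  # default; may be omitted
}
resp = requests.post("http://localhost:8000/api/scrape", json=payload, timeout=300)
resp.raise_for_status()
print(resp.json()["response"])
```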
main.py
CHANGED
@@ -241,6 +241,11 @@ def check_service_status() -> dict:
             "configured": bool(os.getenv("GOOGLE_API_KEY")),
             "env_var": "GOOGLE_API_KEY"
         },
+        "blablador": {
+            "name": "Blablador",
+            "configured": bool(os.getenv("BLABLADOR_API_KEY")),
+            "env_var": "BLABLADOR_API_KEY"
+        },
         "tor": {
             "name": "Tor",
             "configured": False,  # Will be checked dynamically
@@ -442,7 +447,7 @@ def main():
         st.session_state.current_chat_id = new_chat_id
         save_chat_history(st.session_state.chat_history)
     if 'selected_model' not in st.session_state:
-        st.session_state.selected_model = "
+        st.session_state.selected_model = "alias-fast"
     if 'web_scraper_chat' not in st.session_state:
         st.session_state.web_scraper_chat = None
 
@@ -451,7 +456,7 @@ def main():
 
     # Model selection
     st.subheader("Select Model")
-    default_models = ["
+    default_models = ["alias-fast", "alias-large", "gpt-4o-mini", "gemini-1.5-flash"]
     ollama_models = st.session_state.get('ollama_models', [])
     all_models = default_models + [f"ollama:{model}" for model in ollama_models]
     selected_model = st.selectbox("Choose a model", all_models, index=all_models.index(st.session_state.selected_model) if st.session_state.selected_model in all_models else 0)
nginx.conf
ADDED
@@ -0,0 +1,51 @@
+worker_processes 1;
+pid /tmp/nginx.pid;
+
+events {
+    worker_connections 1024;
+}
+
+http {
+    include /etc/nginx/mime.types;
+    default_type application/octet-stream;
+
+    client_body_temp_path /tmp/client_body;
+    proxy_temp_path /tmp/proxy_temp;
+    fastcgi_temp_path /tmp/fastcgi_temp;
+    uwsgi_temp_path /tmp/uwsgi_temp;
+    scgi_temp_path /tmp/scgi_temp;
+
+    access_log /tmp/access.log;
+    error_log /tmp/error.log;
+
+    server {
+        listen 7860;
+        server_name localhost;
+
+        location /api {
+            proxy_pass http://localhost:8000;
+            proxy_set_header Host $host;
+            proxy_set_header X-Real-IP $remote_addr;
+            proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+            proxy_set_header X-Forwarded-Proto $scheme;
+        }
+
+        location /health {
+            proxy_pass http://localhost:8000/health;
+            proxy_set_header Host $host;
+            proxy_set_header X-Real-IP $remote_addr;
+        }
+
+        location / {
+            proxy_pass http://localhost:8501;
+            proxy_http_version 1.1;
+            proxy_set_header Upgrade $http_upgrade;
+            proxy_set_header Connection "upgrade";
+            proxy_set_header Host $host;
+            proxy_set_header X-Real-IP $remote_addr;
+            proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+            proxy_set_header X-Forwarded-Proto $scheme;
+            proxy_read_timeout 86400;
+        }
+    }
+}
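With Nginx listening on 7860, everything is reachable through the single port Hugging Face exposes: `/api` and `/health` are proxied to FastAPI on 8000, and everything else to Streamlit on 8501 (with WebSocket upgrade headers for Streamlit's live connection). A quick routing check, assuming local access to port 7860:

```python
# Verify the reverse-proxy routing on the single exposed port 7860.
# Assumes the container is running and port 7860 is reachable locally.
import requests

base = "http://localhost:7860"
print(requests.get(f"{base}/health").json())  # served by FastAPI via the proxy
print(requests.get(base).status_code)         # Streamlit UI (expect 200)
```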
src/models.py
CHANGED
@@ -38,5 +38,12 @@ class Models:
             return OpenAI(model_name=model_name, **kwargs)
         case name if name.startswith("gemini-"):
             return ChatGoogleGenerativeAI(model=model_name, **kwargs)
+        case "alias-large" | "alias-fast":
+            return ChatOpenAI(
+                model_name=model_name,
+                openai_api_key=os.getenv("BLABLADOR_API_KEY"),
+                openai_api_base="https://api.helmholtz-blablador.fz-juelich.de/v1",
+                **kwargs
+            )
         case _:
             raise ValueError(f"Unsupported model: {model_name}")
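Blablador exposes an OpenAI-compatible endpoint, so the new branch reuses LangChain's `ChatOpenAI` client with an overridden base URL. A standalone sketch of the same call, assuming `ChatOpenAI` comes from `langchain_openai` (or the equivalent import already used in `src/models.py`) and that `BLABLADOR_API_KEY` is set:

```python
# Standalone version of the Blablador branch above: an OpenAI-compatible
# chat client pointed at the Helmholtz Blablador endpoint.
import os
from langchain_openai import ChatOpenAI  # assumed to match the project's import

llm = ChatOpenAI(
    model_name="alias-fast",  # or "alias-large"
    openai_api_key=os.getenv("BLABLADOR_API_KEY"),
    openai_api_base="https://api.helmholtz-blablador.fz-juelich.de/v1",
)
print(llm.invoke("Reply with one word: ready?").content)
```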
src/utils/error_handler.py
CHANGED
@@ -78,6 +78,15 @@ class ErrorMessages:
         f"For help, see: {README_URL}#installation"
     )
 
+    BLABLADOR_API_KEY_MISSING = (
+        "Blablador API Key is missing.\n\n"
+        "Please set the BLABLADOR_API_KEY environment variable:\n"
+        "1. Create a .env file in the project root\n"
+        "2. Add: BLABLADOR_API_KEY=your_key_here\n"
+        "3. Or export it: export BLABLADOR_API_KEY=your_key_here\n\n"
+        f"For setup instructions, see: {README_URL}#installation"
+    )
+
     # Ollama errors
     OLLAMA_NOT_RUNNING = (
         "Ollama is not running or not accessible.\n\n"
@@ -188,4 +197,7 @@ def check_model_api_key(model_name: str) -> str | None:
     if model_name.startswith("gemini-") and not os.getenv("GOOGLE_API_KEY"):
         return ErrorMessages.GOOGLE_API_KEY_MISSING
 
+    if model_name.startswith("alias-") and not os.getenv("BLABLADOR_API_KEY"):
+        return ErrorMessages.BLABLADOR_API_KEY_MISSING
+
     return None
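The guard mirrors the existing OpenAI and Gemini checks: any `alias-*` model now fails fast with a setup hint when `BLABLADOR_API_KEY` is absent. For example:

```python
# check_model_api_key returns an error-message string when the key for the
# selected model is missing, and None when the model is ready to use.
from src.utils.error_handler import check_model_api_key

error = check_model_api_key("alias-fast")
if error:
    print(error)  # prints the Blablador setup instructions
```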
start.sh
ADDED
@@ -0,0 +1,20 @@
+#!/bin/bash
+
+# Start Tor service in the background
+echo "Starting Tor..."
+tor &
+
+# Wait for Tor to start
+sleep 5
+
+# Start FastAPI API in the background
+echo "Starting FastAPI API..."
+python3 api.py &
+
+# Start Streamlit app in the background
+echo "Starting Streamlit..."
+streamlit run main.py --server.port 8501 --server.address 0.0.0.0 --server.enableCORS=false --server.enableXsrfProtection=false &
+
+# Start Nginx in the foreground to keep the container running
+echo "Starting Nginx..."
+/usr/sbin/nginx -c /app/nginx.conf -g "daemon off;"
test_extractor.py
ADDED
@@ -0,0 +1,21 @@
+import asyncio
+from src.web_extractor import WebExtractor
+from src.scrapers.playwright_scraper import ScraperConfig
+
+async def test():
+    config = ScraperConfig(headless=True)
+    try:
+        extractor = WebExtractor(model_name="alias-fast", scraper_config=config)
+        print("WebExtractor initialized successfully!")
+
+        # Test URL extraction
+        from src.web_extractor import extract_url
+        url = extract_url("Check out https://example.com")
+        print(f"Extracted URL: {url}")
+        assert url == "https://example.com"
+
+    except Exception as e:
+        print(f"Error: {e}")
+
+if __name__ == "__main__":
+    asyncio.run(test())
test_patchright.py
ADDED
@@ -0,0 +1,13 @@
+import asyncio
+from patchright.async_api import async_playwright
+
+async def main():
+    async with async_playwright() as p:
+        browser = await p.chromium.launch(headless=True)
+        page = await browser.new_page()
+        await page.goto("https://example.com")
+        print(await page.title())
+        await browser.close()
+
+if __name__ == "__main__":
+    asyncio.run(main())