Spaces:

rkihacker
/

Scrap

Paused

App Files Files Community

rkihacker committed on Sep 15

Commit

4b17916

verified ·

1 Parent(s): 8c635da

Create main.py

Browse files

Files changed (1) hide show

main.py +85 -0

main.py ADDED Viewed

	@@ -0,0 +1,85 @@

+import os
+from fastapi import FastAPI, HTTPException
+import requests
+from bs4 import BeautifulSoup
+import aiohttp
+# --- Configuration ---
+# It's recommended to use environment variables for sensitive data like API keys.
+# Replace with your actual API key and endpoint.
+LLM_API_URL = os.getenv("LLM_API_URL", "https://api.inference.net/v1/chat/completions")
+LLM_API_KEY = os.getenv("LLM_API_KEY", "inference-00050468cc1c4a20bd5ca0997c752329") # Replace with your key
+LLM_MODEL = "meta-llama/llama-3.1-8b-instruct/fp-8"
+app = FastAPI(
+    title="Web Scraper and AI Processor",
+    description="An API to scrape web content and process it with a large language model.",
+    version="1.0.0"
+)
+async def scrape_url(session, url: str):
+    """Asynchronously scrapes the text content from a given URL."""
+    try:
+        async with session.get(url, timeout=10) as response:
+            response.raise_for_status()
+            html_content = await response.text()
+            soup = BeautifulSoup(html_content, "html.parser")
+            # Remove script and style elements
+            for script_or_style in soup(["script", "style"]):
+                script_or_style.decompose()
+            # Get text and clean it up
+            text = soup.get_text()
+            lines = (line.strip() for line in text.splitlines())
+            chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
+            return " ".join(chunk for chunk in chunks if chunk)
+    except requests.exceptions.RequestException as e:
+        raise HTTPException(status_code=400, detail=f"Error fetching the URL: {e}")
+async def process_with_llm(session, content: str, query: str):
+    """Sends the scraped content and a query to the LLM for processing."""
+    headers = {
+        "Content-Type": "application/json",
+        "Authorization": f"Bearer {LLM_API_KEY}",
+    }
+    data = {
+        "messages": [
+            {
+                "role": "system",
+                "content": "You are a helpful assistant that analyzes web content."
+            },
+            {
+                "role": "user",
+                "content": f"Based on the following content, please answer this question: '{query}'\n\nContent:\n{content}"
+            }
+        ],
+        "model": LLM_MODEL,
+        "stream": False  # Set to False for a single response
+    }
+    try:
+        async with session.post(LLM_API_URL, headers=headers, json=data, timeout=30) as response:
+            response.raise_for_status()
+            return await response.json()
+    except aiohttp.ClientError as e:
+        raise HTTPException(status_code=500, detail=f"Error communicating with the LLM API: {e}")
+@app.post("/scrape-and-process/")
+async def scrape_and_process(url: str, query: str):
+    """
+    Scrapes a URL, sends the content to a large language model with a query,
+    and returns the model's response.
+    """
+    async with aiohttp.ClientSession() as session:
+        scraped_content = await scrape_url(session, url)
+        if not scraped_content:
+            raise HTTPException(status_code=404, detail="Could not scrape any content from the URL.")
+        llm_response = await process_with_llm(session, scraped_content, query)
+        return llm_response
+@app.get("/")
+def read_root():
+    return {"message": "Welcome to the Web Scraper and AI Processor API."}
+if __name__ == "__main__":
+    import uvicorn
+    uvicorn.run(app, host="0.0.0.0", port=8000)