Aniket00 committed
Commit b4ccc57 · verified
1 Parent(s): 3066a5e

Upload 3 files

Files changed (3)
  1. app.py +113 -0
  2. main.py +22 -0
  3. requirement.txt +13 -0
app.py ADDED
@@ -0,0 +1,113 @@
+ import gradio as gr
+ import asyncio
+ from crawl4ai import AsyncWebCrawler
+ from urllib.parse import urlparse
+ from langchain_community.document_loaders import TextLoader
+ from langchain.text_splitter import CharacterTextSplitter
+ from langchain_huggingface import HuggingFaceEmbeddings
+ from langchain_community.vectorstores import FAISS
+ from langchain.prompts import PromptTemplate
+ from langchain.schema.runnable import RunnableMap, RunnablePassthrough
+ from langchain.schema.output_parser import StrOutputParser
+ from langchain_groq import ChatGroq
+ import re
+ import os
+ from dotenv import load_dotenv
+ load_dotenv()
+
+
+ GROQ_API_KEY = os.getenv("GROQ_API_KEY")
+
+ qa_chain = None
+ scraped_file = None
+
+ # Clean LLM output
+ class StrictOutputParser(StrOutputParser):
+     def parse(self, text: str) -> str:
+         text = re.sub(r'<think>.*?</think>', '', text, flags=re.DOTALL)
+         text = re.sub(r'^(Reasoning|Thought|Analysis):.*?\n', '', text, flags=re.IGNORECASE)
+         return text.strip()
+
+ # Async crawl function
+ async def crawl_site(url):
+     async with AsyncWebCrawler() as crawler:
+         result = await crawler.arun(url=url)
+         return result.markdown
+
+ # UI-triggered scraper
+ def scrape_website(url):
+     global scraped_file
+     markdown = asyncio.run(crawl_site(url))
+     domain = urlparse(url).netloc.replace("www.", "")
+     filename = f"{domain}.txt"
+     with open(filename, "w", encoding="utf-8") as f:
+         f.write(markdown)
+     scraped_file = filename
+     return filename, markdown
+
+ # Query setup
+ def setup_qa():
+     global qa_chain
+     loader = TextLoader(scraped_file, encoding="utf-8")
+     docs = CharacterTextSplitter(chunk_size=1000, chunk_overlap=100).split_documents(loader.load())
+     vectorstore = FAISS.from_documents(docs, HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2"))
+     retriever = vectorstore.as_retriever()
+     prompt = PromptTemplate.from_template("""
+ You are an AI assistant. Return ONLY the final answer.
+
+ **Rules (MUST follow):**
+ 1. NO <think>, reasoning, or explanations.
+ 2. NO markdown/formatting tags.
+ 3. Answer in 3-4 concise sentences.
+
+ Context:
+ {context}
+
+ Question:
+ {question}
+
+ Answer (direct and short):""")
+
+     llm = ChatGroq(
+         api_key=GROQ_API_KEY,  # Use environment variable for security
+         model="deepseek-r1-distill-llama-70b",
+         temperature=0.0
+     )
+
+     qa_chain = (
+         RunnableMap({
+             "context": retriever,
+             "question": RunnablePassthrough()
+         }) | prompt | llm | StrictOutputParser()
+     )
+     return "✅ Query system ready!"
+
+ # Handle questions
+ def ask_question(query):
+     if not qa_chain:
+         return "❗ Please set up the QA system first."
+     return qa_chain.invoke(query)
+
+ # Gradio interface
+ with gr.Blocks(title="Web Scraping AI Agent") as demo:
+     gr.Markdown("## 🌐 Website Scraper AI Agent")
+
+     url_input = gr.Textbox(label="Enter Website URL")
+     scrape_btn = gr.Button("🔍 Scrape Website")
+     download_output = gr.File(label="📄 Download Scraped File")
+     markdown_box = gr.Textbox(label="Scraped Text", lines=10)
+
+     setup_btn = gr.Button("💬 Query This Website")
+     setup_status = gr.Textbox(label="Status")
+
+     query_input = gr.Textbox(label="Ask a Question")
+     query_btn = gr.Button("Ask")
+     query_output = gr.Textbox(label="Answer")
+
+     # Wire components
+     scrape_btn.click(fn=scrape_website, inputs=[url_input], outputs=[download_output, markdown_box])
+     setup_btn.click(fn=setup_qa, outputs=setup_status)
+     query_btn.click(fn=ask_question, inputs=[query_input], outputs=[query_output])
+
+ # Run
+ demo.launch(share=True)
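
For reference, a minimal usage sketch (not part of this commit) of how the app.py functions compose when driven without the Gradio UI, assuming GROQ_API_KEY is set in a local .env file. As committed, importing app.py also starts the Gradio interface, since demo.launch() runs at module level.

# Illustrative only: reuses the functions defined in app.py above.
from app import scrape_website, setup_qa, ask_question

# 1. Crawl the page and persist it as <domain>.txt
filename, markdown = scrape_website("https://example.com")

# 2. Build the FAISS index and the retrieval chain over the scraped file
print(setup_qa())

# 3. Query the indexed content through the Groq-backed chain
print(ask_question("What is this page about?"))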
main.py ADDED
@@ -0,0 +1,22 @@
+ import asyncio
+ from crawl4ai import AsyncWebCrawler
+ from urllib.parse import urlparse
+
+
+ url = input("Enter the website URL: ").strip()
+ async def main():
+     async with AsyncWebCrawler() as crawler:
+         result = await crawler.arun(
+             url=url,
+         )
+         print(result.markdown)
+
+     domain = urlparse(url).netloc.replace("www.", "")
+     filename = f"{domain}.txt"
+     with open(filename, "w", encoding="utf-8") as f:
+         f.write(result.markdown)
+
+     print(f"\n✅ Scraped content saved to '{filename}'")
+
+ if __name__ == "__main__":
+     asyncio.run(main())
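
A possible extension (sketch, not part of this commit): crawling several URLs concurrently, reusing only the arun() call shown in main.py. Sharing one AsyncWebCrawler session across concurrent arun() calls is an assumption here, not something the committed code does.

import asyncio
from urllib.parse import urlparse
from crawl4ai import AsyncWebCrawler

async def crawl_many(urls):
    async with AsyncWebCrawler() as crawler:
        # One arun() task per URL, awaited together
        results = await asyncio.gather(*(crawler.arun(url=u) for u in urls))
    for u, result in zip(urls, results):
        # Save each page's markdown to <domain>.txt, as main.py does for a single URL
        domain = urlparse(u).netloc.replace("www.", "")
        with open(f"{domain}.txt", "w", encoding="utf-8") as f:
            f.write(result.markdown)

if __name__ == "__main__":
    asyncio.run(crawl_many(["https://example.com", "https://example.org"]))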
requirement.txt ADDED
@@ -0,0 +1,13 @@
+ gradio
+ crawl4ai
+ urllib3
+ langchain
+ langchain-core
+ langchain-community
+ langchain-huggingface
+ langchain-groq
+ huggingface-hub
+ sentence-transformers
+ faiss-cpu
+ python-dotenv
+ aiohttp
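
app.py reads the Groq key through python-dotenv (load_dotenv() plus os.getenv("GROQ_API_KEY")), so a local .env file is expected next to the code. A minimal example with a placeholder value:

GROQ_API_KEY=your-groq-api-key-here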