# NOTE: lines below were scraped from a HuggingFace Spaces page; the
# "Spaces: / Sleeping / Sleeping" UI residue was removed during cleanup.
"""Streamlit app: crawl a website, index its content with a vector store,
and answer a semantic-search query using a HuggingFace chat model."""
import streamlit as st
import asyncio
from crawl4ai import AsyncWebCrawler
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
from langchain_core.documents.base import Document
from langchain.text_splitter import CharacterTextSplitter
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores.chroma import Chroma
from langchain_huggingface.chat_models import ChatHuggingFace
from langchain_huggingface.llms import HuggingFaceEndpoint
import os

# ------------------------------------------------------------------------------
# Set your API tokens.
# BUG FIX: os.getenv("key") returns None when the variable is unset, and
# `os.environ[...] = None` raises TypeError — guard instead of crashing.
# ------------------------------------------------------------------------------
_hf_token = os.getenv("key")
if _hf_token:
    os.environ['HUGGINGFACEHUB_API_TOKEN'] = _hf_token
    os.environ['HF_TOKEN'] = _hf_token
else:
    st.warning("Environment variable 'key' is not set; HuggingFace API calls may fail.")

# ------------------------------------------------------------------------------
# Streamlit App
# ------------------------------------------------------------------------------
st.title("Web Crawler + Semantic Search + Conversational Model")

# Input for the website to crawl
url = st.text_input("Enter a website URL to crawl:")
# Input for semantic search
query = st.text_input("Enter your semantic search query:")

# Button to start the process
if st.button("Analyze and Query"):
    if not url or not query:
        st.error("Please provide both a URL and a semantic search query.")
    else:
        with st.spinner("Crawling website, retrieving documents, and generating a response..."):
            async def main():
                """Crawl `url`, build a retrieval index over the page, and
                answer `query` with the chat model.

                Returns:
                    str: the model's answer text.
                """
                # --- Crawl the target page and take its markdown rendering ---
                browser_config = BrowserConfig()
                run_config = CrawlerRunConfig()
                async with AsyncWebCrawler(config=browser_config) as crawler:
                    result = await crawler.arun(url=url, config=run_config)
                doc = Document(page_content=result.markdown.raw_markdown)

                # --- Split the page into overlapping chunks for retrieval ---
                text_splitter = CharacterTextSplitter(
                    chunk_size=1000,
                    chunk_overlap=100,
                )
                chunks = text_splitter.split_documents([doc])

                # --- Embed, index, and fetch the 3 most relevant chunks ---
                # BUG FIX: HuggingFaceEmbeddings takes `model_name`, not
                # `model` — the original kwarg raises a validation error.
                emb = HuggingFaceEmbeddings(model_name='avsolatorio/GIST-small-Embedding-v0')
                db = Chroma.from_documents(chunks, emb, persist_directory='chroma_db')
                docs = db.similarity_search(query, k=3)
                context = " ".join([d.page_content for d in docs])

                # --- Ask the chat model, grounded in the retrieved context ---
                deepseek_endpoint = HuggingFaceEndpoint(
                    repo_id='deepseek-ai/DeepSeek-Prover-V2-671B',
                    provider='sambanova',
                    temperature=0.5,
                    max_new_tokens=50,
                    task='conversational'
                )
                # FIX: ChatHuggingFace only wraps an existing endpoint; the
                # generation parameters (repo_id, provider, temperature, ...)
                # belong on HuggingFaceEndpoint above, so the duplicated
                # kwargs the original passed here were dropped.
                deep_seek = ChatHuggingFace(llm=deepseek_endpoint)
                message = f"""Context:\n{context}\nQuestion:\n{query}"""
                response = deep_seek.invoke([{"role": "user", "content": message}])
                return response.content

            response = asyncio.run(main())
        st.success("Done.")
        st.write("**Response from Model:**")
        st.write(response)
# ------------------------------------------------------------------------------
# End of Streamlit App
# ------------------------------------------------------------------------------