Spaces:

DOMMETI
/

Web_Crawler

Sleeping

App Files Community

DOMMETI committed on Jun 16

Commit

5d8edc1

verified ·

1 Parent(s): 95b707a

Update src/streamlit_app.py

Browse files

Files changed (1) hide show

src/streamlit_app.py +95 -38

src/streamlit_app.py CHANGED Viewed

@@ -1,40 +1,97 @@
-import altair as alt
-import numpy as np
-import pandas as pd
 import streamlit as st
-"""
-# Welcome to Streamlit!
-Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
-If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
-forums](https://discuss.streamlit.io).
-In the meantime, below is an example of what you can do with just a few lines of code:
-"""
-num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
-num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
-indices = np.linspace(0, 1, num_points)
-theta = 2 * np.pi * num_turns * indices
-radius = indices
-x = radius * np.cos(theta)
-y = radius * np.sin(theta)
-df = pd.DataFrame({
-    "x": x,
-    "y": y,
-    "idx": indices,
-    "rand": np.random.randn(num_points),
-})
-st.altair_chart(alt.Chart(df, height=700, width=700)
-    .mark_point(filled=True)
-    .encode(
-        x=alt.X("x", axis=None),
-        y=alt.Y("y", axis=None),
-        color=alt.Color("idx", legend=None, scale=alt.Scale()),
-        size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
-    ))

 import streamlit as st
+import asyncio
+from crawl4ai import AsyncWebCrawler
+from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
+from langchain_core.documents.base import Document
+from langchain.text_splitter import CharacterTextSplitter
+from langchain_huggingface.embeddings import HuggingFaceEmbeddings
+from langchain.vectorstores.chroma import Chroma
+from langchain_huggingface.chat_models import ChatHuggingFace
+from langchain_huggingface.llms import HuggingFaceEndpoint
+import os
+# ------------------------------------------------------------------------------
+# Set your API tokens
+# ------------------------------------------------------------------------------
+# The Hugging Face token is read from the "key" environment variable / secret.
+# os.environ only accepts str values, so assigning os.getenv("key") directly
+# raises TypeError when the secret is not configured. Read it once and export
+# it only when it is actually present.
+hf_token = os.getenv("key")
+if hf_token:
+    os.environ['HUGGINGFACEHUB_API_TOKEN'] = hf_token
+    os.environ['HF_TOKEN'] = hf_token
+elif not (os.getenv('HUGGINGFACEHUB_API_TOKEN') or os.getenv('HF_TOKEN')):
+    # No token from any source: explain the problem in the UI and halt this
+    # script run instead of crashing with an opaque traceback.
+    st.error(
+        "Missing Hugging Face API token: set the `key` secret/environment variable."
+    )
+    st.stop()
+# ------------------------------------------------------------------------------
+# Streamlit App
+# ------------------------------------------------------------------------------
+st.title("Web Crawler + Semantic Search + Conversational Model")
+# Input for the website to crawl
+url = st.text_input("Enter a website URL to crawl:")
+# Input for semantic search
+query = st.text_input("Enter your semantic search query:")
+# Button to start the process
+if st.button("Analyze and Query"):
+    if not url or not query:
+        st.error("Please provide both a URL and a semantic search query.")
+    else:
+        async def main():
+            """Crawl ``url``, retrieve the chunks closest to ``query`` and ask the model.
+
+            Reads ``url`` and ``query`` from the enclosing script scope.
+
+            Returns:
+                The ``content`` of the chat model's reply.
+
+            Raises:
+                RuntimeError: If the crawl fails or the page yields no text to index.
+            """
+            import uuid
+
+            # Crawling
+            browser_config = BrowserConfig()
+            run_config = CrawlerRunConfig()
+            async with AsyncWebCrawler(config=browser_config) as crawler:
+                result = await crawler.arun(url=url, config=run_config)
+            # A failed crawl has no usable markdown; report it explicitly rather
+            # than failing later with an AttributeError.
+            if not result.success:
+                raise RuntimeError(f"Could not crawl {url}: {result.error_message}")
+            markdown = result.markdown
+            raw_text = markdown.raw_markdown if markdown else ""
+            if not raw_text or not raw_text.strip():
+                raise RuntimeError("The crawled page contained no text to index.")
+            doc = Document(page_content=raw_text)
+            # Split documents into chunks
+            text_splitter = CharacterTextSplitter(
+                chunk_size=1000,
+                chunk_overlap=100,
+            )
+            chunks = text_splitter.split_documents([doc])
+            if not chunks:
+                raise RuntimeError("The crawled page contained no text to index.")
+            # Embedding and Vector Store
+            emb = HuggingFaceEmbeddings(model='avsolatorio/GIST-small-Embedding-v0')
+            # Use a throwaway, uniquely named collection per request. A fixed
+            # on-disk collection keeps every previously crawled page, so the
+            # search could return chunks from unrelated sites.
+            db = Chroma.from_documents(
+                chunks,
+                emb,
+                collection_name=f"crawl_{uuid.uuid4().hex}",
+            )
+            try:
+                docs = db.similarity_search(query, k=3)
+            finally:
+                # Drop the temporary collection so repeated requests do not
+                # accumulate data in the vector store.
+                db.delete_collection()
+            context = " ".join(d.page_content for d in docs)
+            # Prepare and call the chat model
+            deepseek_endpoint = HuggingFaceEndpoint(
+                repo_id='deepseek-ai/DeepSeek-Prover-V2-671B',
+                provider='sambanova',
+                temperature=0.5,
+                max_new_tokens=50,
+                task='conversational'
+            )
+            # NOTE(review): the keyword arguments below repeat the endpoint
+            # settings; confirm ChatHuggingFace actually consumes them.
+            deep_seek = ChatHuggingFace(
+                llm=deepseek_endpoint,
+                repo_id='deepseek-ai/DeepSeek-Prover-V2-671B',
+                provider='sambanova',
+                temperature=0.5,
+                max_new_tokens=50,
+                task='conversational'
+            )
+            message = f"""Context:\n{context}\nQuestion:\n{query}"""
+            response = deep_seek.invoke([{"role": "user", "content": message}])
+            return response.content
+
+        try:
+            with st.spinner("Crawling website, retrieving documents, and generating a response..."):
+                response = asyncio.run(main())
+        except Exception as exc:
+            # Top-level UI boundary: crawling, embedding and inference all touch
+            # the network, so show the failure to the user instead of a traceback.
+            st.error(f"Failed to process the request: {exc}")
+        else:
+            st.success("Done.")
+            st.write("**Response from Model:**")
+            st.write(response)
+# ------------------------------------------------------------------------------
+# End of Streamlit App
+# ------------------------------------------------------------------------------