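# Assumed dependencies (inferred from the imports below; exact pins are not part
# of the original file): streamlit, requests, beautifulsoup4, langdetect,
# googletrans. The widely used googletrans==4.0.0rc1 release exposes a
# synchronous Translator.translate(), which this script relies on; newer
# releases may be async.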
import streamlit as st
import requests
from bs4 import BeautifulSoup
import re
from requests.sessions import Session
from langdetect import detect
from googletrans import Translator

def scrape_visible_text_from_url(url, query_selector=None, email=None, password=None, login_url=None):
    try:
        session = Session()

        # Log in first if credentials and a login URL are provided, so the
        # session carries the authentication cookies when fetching the page
        if email and password and login_url:
            login_data = {
                'email': email,
                'password': password
                # Include other necessary fields as required by the website
            }
            login_response = session.post(login_url, data=login_data)
            login_response.raise_for_status()

        # Fetch the target page (with the authenticated session, if any)
        response = session.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Capture the header text before stripping layout tags, so it can
        # still be included in the default (no-selector) output below
        header_content = soup.find("header")
        header_text = header_content.get_text(" ", strip=True) if header_content else ""

        # Remove tags that do not contain visible page content
        for tag in soup(["script", "style", "meta", "link", "noscript", "header", "footer", "aside", "nav", "img"]):
            tag.extract()

        # Use the CSS query selector if provided
        if query_selector:
            elements = soup.select(query_selector)
            text_content = " ".join(element.get_text() for element in elements)
        else:
            # Otherwise take all remaining text from the body
            body_content = soup.body or soup
            body_text = body_content.get_text(separator=" ")
            text_content = f"{header_text}\n\n{body_text}"

        # Clean up whitespace
        visible_text = re.sub(r'\s+', ' ', text_content).strip()

        # Translate non-English text to English, sentence by sentence
        translator = Translator()
        sentences = re.split(r'(?<=[.!?]) +', visible_text)
        translated_sentences = []
        for sentence in sentences:
            try:
                lang = detect(sentence)
                if lang != 'en':
                    translation = translator.translate(sentence, dest='en').text
                    translated_sentences.append(translation)
                else:
                    translated_sentences.append(sentence)
            except Exception:
                # If detection or translation fails, keep the original sentence
                translated_sentences.append(sentence)
        translated_text = ' '.join(translated_sentences)
        return translated_text
    except Exception as e:
        st.error(f"Error occurred while scraping the data: {e}")
        return None
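
# Illustrative only: the scraper can also be called outside the Streamlit UI.
# The URL and selector below are placeholders, not part of the original app;
# this is left as a comment so it never runs when Streamlit imports the file.
#
#   text = scrape_visible_text_from_url(
#       "https://example.com",         # placeholder URL
#       query_selector="article p",    # optional CSS selector
#   )
#   print(text)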

def main():
    st.title("Web Data Scraper")

    url_input = st.text_input("Enter the URL:", "")
    query_selector = st.text_input("Enter a query selector (optional):", "")
    email = st.text_input("Email (if authentication required):", "")
    password = st.text_input("Password (if authentication required):", "", type="password")
    login_url = st.text_input("Enter the login URL (if authentication required):", "")

    if st.button("Load Data"):
        if url_input:
            data = scrape_visible_text_from_url(
                url=url_input,
                query_selector=query_selector if query_selector else None,
                email=email if email else None,
                password=password if password else None,
                login_url=login_url if login_url else None
            )
            if data:
                st.success("Data text successfully scraped!")
                st.subheader("Scraped Text:")
                st.write(data)
            else:
                st.warning("Failed to load data from the URL.")
        else:
            st.warning("Please enter a valid URL.")


if __name__ == "__main__":
    main()
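
# To run locally (assuming this file is saved as app.py, the usual convention
# for Streamlit apps on Hugging Face Spaces): streamlit run app.py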