|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import requests |
|
|
from config import CONTENT_EXTRACTION, SEARCH_SELECTION |
|
|
from src.core.web_loader import web_loader |
|
|
|
|
|
class BrowserEngine:
    """Fetch page content and search results through the configured reader API.

    The configuration object passed to the constructor is read for
    ``content_reader_api``, ``request_timeout``, ``baidu_endpoint`` and
    ``searxng_endpoint``. Every outgoing request carries the header set
    produced by :meth:`generate_headers`.
    """

    def __init__(self, configuration):
        """Keep a reference to *configuration* for later requests."""
        self.config = configuration

    def generate_headers(self):
        """Build a fresh dict of HTTP request headers from ``web_loader`` values.

        Returns:
            A new dict on every call. The client-IP style headers all carry
            the IPv4 value, ``X-Forwarded-For`` carries both the IPv4 and
            IPv6 values, and the language, country and timezone entries are
            read from the mapping returned by ``web_loader.get_location()``.
        """
        # The web_loader getters are called in a fixed order; keep it stable
        # in case they draw from shared state (not visible from this file).
        ipv4 = web_loader.get_ipv4()
        ipv6 = web_loader.get_ipv6()
        agent = web_loader.get_user_agent()
        origin = web_loader.get_origin()
        referrer = web_loader.get_referrer()
        location = web_loader.get_location()

        headers = {
            "User-Agent": agent,
            "X-Forwarded-For": f"{ipv4}, {ipv6}",
        }

        # All of these client-address headers advertise the same IPv4 value.
        for header_name in (
            "X-Real-IP",
            "X-Originating-IP",
            "X-Remote-IP",
            "X-Remote-Addr",
            "X-Client-IP",
        ):
            headers[header_name] = ipv4

        # The forwarded host is the origin with its URL scheme removed.
        bare_host = origin.replace("https://", "").replace("http://", "")
        headers["X-Forwarded-Host"] = bare_host
        headers["Origin"] = origin
        headers["Referer"] = referrer
        headers["Accept-Language"] = f"{location['language']},en;q=0.9"

        # Fixed header values, identical for every request.
        headers.update(
            {
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
                "Accept-Encoding": "gzip, deflate, br",
                "DNT": "1",
                "Connection": "keep-alive",
                "Upgrade-Insecure-Requests": "1",
                "Sec-Fetch-Dest": "document",
                "Sec-Fetch-Mode": "navigate",
                "Sec-Fetch-Site": "cross-site",
                "Sec-Fetch-User": "?1",
                "Cache-Control": "max-age=0",
            }
        )

        headers["X-Country"] = location['country']
        headers["X-Timezone"] = location['timezone']
        return headers

    def extract_page_content(self, target_url: str) -> str:
        """POST *target_url* to the reader API and return the response text.

        Args:
            target_url: URL sent to the reader API as the ``url`` form field.

        Returns:
            The response body followed by ``CONTENT_EXTRACTION``. If anything
            raises (header generation, the request itself, or a non-success
            HTTP status), the string ``"Error reading URL: <message>"`` is
            returned instead; no exception propagates to the caller.
        """
        try:
            request_headers = self.generate_headers()
            form_fields = {"url": target_url}
            reply = requests.post(
                self.config.content_reader_api,
                data=form_fields,
                headers=request_headers,
                timeout=self.config.request_timeout,
            )
            # Turn 4xx/5xx statuses into exceptions handled below.
            reply.raise_for_status()
            page_text = reply.text
            return f"{page_text}{CONTENT_EXTRACTION}"
        except Exception as failure:
            return f"Error reading URL: {failure!s}"

    def perform_search(self, search_query: str, search_provider: str = "google") -> str:
        """Run *search_query* through the reader API and return the result text.

        Args:
            search_query: Query text; it is percent-encoded before being
                placed in the request URL.
            search_provider: ``"baidu"`` uses ``config.baidu_endpoint`` with a
                ``wd`` parameter. Any other value goes through
                ``config.searxng_endpoint`` with the query prefixed by
                ``!go`` for ``"google"`` and ``!bi`` for everything else
                (presumably search-engine shortcut prefixes; confirm against
                the endpoint's documentation).

        Returns:
            The response body followed by ``SEARCH_SELECTION``. If anything
            raises, the string ``"Error during search: <message>"`` is
            returned instead; no exception propagates to the caller.
        """
        try:
            request_headers = self.generate_headers()

            if search_provider == "baidu":
                url = (
                    f"{self.config.content_reader_api}{self.config.baidu_endpoint}"
                    f"?wd={requests.utils.quote(search_query)}"
                )
                selector = "#content_left"
            else:
                if search_provider == "google":
                    prefix = "!go"
                else:
                    prefix = "!bi"
                quoted = requests.utils.quote(f"{prefix} {search_query}")
                url = f"{self.config.content_reader_api}{self.config.searxng_endpoint}?q={quoted}"
                selector = "#urls"

            # Selector for the part of the result page to return; it differs
            # per provider.
            request_headers["X-Target-Selector"] = selector

            reply = requests.get(
                url,
                headers=request_headers,
                timeout=self.config.request_timeout,
            )
            # Turn 4xx/5xx statuses into exceptions handled below.
            reply.raise_for_status()
            result_text = reply.text
            return f"{result_text}{SEARCH_SELECTION}"
        except Exception as failure:
            return f"Error during search: {failure!s}"