Spaces:
Build error
Build error
Update app.py
Browse files
app.py
CHANGED
|
@@ -8,24 +8,21 @@ from urllib.parse import unquote, urlparse
|
|
| 8 |
|
| 9 |
app = FastAPI()
|
| 10 |
|
|
|
|
| 11 |
def convert_cookies_to_dict(cookies):
|
| 12 |
-
|
| 13 |
-
|
|
|
|
| 14 |
|
|
|
|
| 15 |
def get_root_domain(url):
    """Return the root domain (last two host labels) of *url*.

    The host is taken from ``urlparse(url).hostname`` so that any
    ``user:password@`` prefix and ``:port`` suffix are excluded; the result
    is used to build a cookie domain, where a port would be invalid.

    Args:
        url: An absolute URL such as ``"https://www.example.com:8443/x"``.

    Returns:
        The last two dot-separated labels of the host (``"example.com"``),
        the whole host when it has a single label (``"localhost"``) or is an
        IP address, or ``""`` when the URL has no network location. The host
        is lower-cased, as ``hostname`` always is.

    Note:
        Multi-label public suffixes (``example.co.uk``) are not recognised;
        the function simply keeps the last two labels.
    """
    import ipaddress

    host = (urlparse(url).hostname or "").rstrip(".")

    # An IP address has no "root domain"; keep it intact instead of
    # truncating it to its last two octets.
    try:
        ipaddress.ip_address(host)
    except ValueError:
        pass
    else:
        return host

    labels = host.split(".")
    if len(labels) > 1:
        return ".".join(labels[-2:])
    return host
|
| 30 |
|
| 31 |
@app.get("/")
|
|
@@ -60,39 +57,49 @@ def chrome(url:str=None,wait:int=5,header:str=None,cookie:str=None):
|
|
| 60 |
# 如果输入了cookie
|
| 61 |
if type(cookie) == str:
|
| 62 |
header_array.update({"cookie":unquote(cookie)})
|
| 63 |
-
|
|
|
|
| 64 |
options = Options()
|
|
|
|
|
|
|
| 65 |
options.add_argument('--headless')
|
| 66 |
-
|
|
|
|
| 67 |
driver = webdriver.Chrome(options=options)
|
| 68 |
|
|
|
|
| 69 |
driver.get(target_url)
|
| 70 |
|
|
|
|
| 71 |
if 'cookie' in header_array:
|
| 72 |
cookie_array = convert_cookies_to_dict(header_array['cookie'])
|
| 73 |
del header_array['cookie']
|
| 74 |
for key, value in cookie_array.items():
|
| 75 |
driver.add_cookie({"name": key, "value": value, "domain": f'.{target_domain}', "path": "/", "secure": False})
|
| 76 |
-
|
|
|
|
| 77 |
driver.header_overrides = header_array
|
| 78 |
-
|
|
|
|
| 79 |
driver.get(target_url)
|
| 80 |
|
|
|
|
| 81 |
print(driver.page_source)
|
| 82 |
|
|
|
|
| 83 |
if wait_time > 0:
|
| 84 |
time.sleep(wait_time)
|
| 85 |
|
| 86 |
-
#
|
| 87 |
current_url = driver.current_url
|
| 88 |
|
| 89 |
-
#
|
| 90 |
page_source = driver.page_source
|
| 91 |
|
| 92 |
-
#
|
| 93 |
cookies = driver.get_cookies()
|
| 94 |
|
| 95 |
-
#
|
| 96 |
is_jump = (target_url != current_url)
|
| 97 |
|
| 98 |
data = {
|
|
|
|
| 8 |
|
| 9 |
app = FastAPI()
|
| 10 |
|
| 11 |
+
# 解析cookie字符串为字典
|
| 12 |
def convert_cookies_to_dict(cookies):
    """Parse a ``Cookie`` header string into a ``{name: value}`` dict.

    Pairs are separated by ``;`` with optional surrounding whitespace, so
    both ``"a=1; b=2"`` and ``"a=1;b=2"`` are accepted. Only the first ``=``
    of a pair separates name from value, so values may themselves contain
    ``=``. A pair without ``=`` is stored with an empty value.

    Args:
        cookies: The raw cookie string, e.g. ``"sid=abc; theme=dark"``.
            ``None`` or an empty string yields an empty dict.

    Returns:
        A dict mapping each cookie name to its value, both stripped of
        surrounding whitespace. Empty segments and pairs with an empty name
        are skipped, since a cookie without a name cannot be set.
    """
    parsed_cookies = {}
    if not cookies:
        return parsed_cookies

    for item in cookies.split(";"):
        name, _, value = item.partition("=")
        name = name.strip()
        if not name:
            # Trailing separators or stray "=value" fragments carry no name.
            continue
        parsed_cookies[name] = value.strip()
    return parsed_cookies
|
| 16 |
|
| 17 |
+
# 获取域名字符串的根域
|
| 18 |
def get_root_domain(url):
    """Return the root domain (last two host labels) of *url*.

    The host is taken from ``urlparse(url).hostname`` so that any
    ``user:password@`` prefix and ``:port`` suffix are excluded; the result
    is used to build a cookie domain, where a port would be invalid.

    Args:
        url: An absolute URL such as ``"https://www.example.com:8443/x"``.

    Returns:
        The last two dot-separated labels of the host (``"example.com"``),
        the whole host when it has a single label (``"localhost"``) or is an
        IP address, or ``""`` when the URL has no network location. The host
        is lower-cased, as ``hostname`` always is.

    Note:
        Multi-label public suffixes (``example.co.uk``) are not recognised;
        the function simply keeps the last two labels.
    """
    import ipaddress

    host = (urlparse(url).hostname or "").rstrip(".")

    # An IP address has no "root domain"; keep it intact instead of
    # truncating it to its last two octets.
    try:
        ipaddress.ip_address(host)
    except ValueError:
        pass
    else:
        return host

    labels = host.split(".")
    if len(labels) > 1:
        return ".".join(labels[-2:])
    return host
|
| 27 |
|
| 28 |
@app.get("/")
|
|
|
|
| 57 |
# 如果输入了cookie
|
| 58 |
if type(cookie) == str:
|
| 59 |
header_array.update({"cookie":unquote(cookie)})
|
| 60 |
+
|
| 61 |
+
# 初始化浏览器
|
| 62 |
options = Options()
|
| 63 |
+
|
| 64 |
+
# 设置为无头模式
|
| 65 |
options.add_argument('--headless')
|
| 66 |
+
|
| 67 |
+
# 实例化
|
| 68 |
driver = webdriver.Chrome(options=options)
|
| 69 |
|
| 70 |
+
# 需要打开网址页面,才能用 driver.add_cookie 进行cookie追加
|
| 71 |
driver.get(target_url)
|
| 72 |
|
| 73 |
+
# 对浏览器追加指定域名的cookie
|
| 74 |
if 'cookie' in header_array:
|
| 75 |
cookie_array = convert_cookies_to_dict(header_array['cookie'])
|
| 76 |
del header_array['cookie']
|
| 77 |
for key, value in cookie_array.items():
|
| 78 |
driver.add_cookie({"name": key, "value": value, "domain": f'.{target_domain}', "path": "/", "secure": False})
|
| 79 |
+
|
| 80 |
+
# 覆写下次访问的请求头(没有修改的则保持原样)
|
| 81 |
driver.header_overrides = header_array
|
| 82 |
+
|
| 83 |
+
# 再次访问网址
|
| 84 |
driver.get(target_url)
|
| 85 |
|
| 86 |
+
# 输出此时访问的网页源码
|
| 87 |
print(driver.page_source)
|
| 88 |
|
| 89 |
+
# 等待多少秒,来预估网页完全的加载完成(执行完内部的所有js,因为部分js可能涉及到请求后的动态处理,或者延时跳转)
|
| 90 |
if wait_time > 0:
|
| 91 |
time.sleep(wait_time)
|
| 92 |
|
| 93 |
+
# 获取完全加载完成时,页面的URL
|
| 94 |
current_url = driver.current_url
|
| 95 |
|
| 96 |
+
# 获取完全加载完成时,页面的源代码
|
| 97 |
page_source = driver.page_source
|
| 98 |
|
| 99 |
+
# 获取完全加载完成时,页面的cookie
|
| 100 |
cookies = driver.get_cookies()
|
| 101 |
|
| 102 |
+
# 完全加载完成时,页面是否有发生过 301 302 跳转过
|
| 103 |
is_jump = (target_url != current_url)
|
| 104 |
|
| 105 |
data = {
|