Spaces:
Sleeping
Sleeping
Upload folder using huggingface_hub
Browse files
run.py
CHANGED
|
@@ -84,7 +84,122 @@ def format_msg(it, init_msg, pdf_obs, warn_obs, web_img_b64, web_text):
|
|
| 84 |
]
|
| 85 |
}
|
| 86 |
|
| 87 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 88 |
|
| 89 |
def webvoyager_run(args, task, task_dir):
|
| 90 |
"""
|
|
@@ -133,7 +248,8 @@ def webvoyager_run(args, task, task_dir):
|
|
| 133 |
|
| 134 |
action_key, info = extract_information(gpt_4v_res.split('Action:')[1].strip())
|
| 135 |
|
| 136 |
-
# ... (action execution logic
|
|
|
|
| 137 |
if action_key == 'answer':
|
| 138 |
break
|
| 139 |
|
|
|
|
| 84 |
]
|
| 85 |
}
|
| 86 |
|
| 87 |
+
def call_gpt4v_api(args, openai_client, messages):
    """Call the chat-completions endpoint, retrying on selected errors.

    Args:
        args: Run configuration. The attributes ``text_only``, ``api_model``
            and ``seed`` are read on every attempt.
        openai_client: Object exposing ``chat.completions.create(**kwargs)``.
        messages: Chat messages, forwarded unchanged to the API.

    Returns:
        A 4-tuple ``(prompt_tokens, completion_tokens, gpt_call_error,
        openai_response)``. On success ``gpt_call_error`` is ``False``. On
        failure it is ``True`` and the other three items are ``None``.
    """
    # Seconds to wait before the next attempt, keyed by the exception's class
    # name. Any exception whose name is not listed here is not retried.
    retry_delays = {'RateLimitError': 10, 'APIError': 15}
    max_attempts = 10

    for _ in range(max_attempts):
        try:
            request_kwargs = {
                'model': args.api_model,
                'messages': messages,
                'max_tokens': 1000,
                'seed': args.seed,
            }
            if not args.text_only:
                logging.info('Calling gpt4v API...')
            else:
                logging.info('Calling gpt4 API...')
                # Only the text-only request carries an explicit timeout.
                request_kwargs['timeout'] = 30

            openai_response = openai_client.chat.completions.create(**request_kwargs)

            prompt_tokens = openai_response.usage.prompt_tokens
            completion_tokens = openai_response.usage.completion_tokens
            logging.info(f'Prompt Tokens: {prompt_tokens}; Completion Tokens: {completion_tokens}')

            return prompt_tokens, completion_tokens, False, openai_response

        except Exception as e:
            error_name = type(e).__name__
            logging.info(f'Error occurred, retrying. Error type: {error_name}')

            delay = retry_delays.get(error_name)
            if delay is None:
                # Covers 'InvalidRequestError' and every other unlisted error:
                # give up immediately and report the failure to the caller.
                return None, None, True, None

            time.sleep(delay)

    logging.info('Retrying too many times')
    return None, None, True, None
|
| 131 |
+
|
| 132 |
+
|
| 133 |
+
def exec_action_click(info, web_ele, driver_task):
    """Click ``web_ele`` after setting its ``target`` attribute to ``_self``.

    Args:
        info: Parsed action payload; not read by this function.
        web_ele: Element to click; must expose ``click()``.
        driver_task: Driver exposing ``execute_script(script, *args)``.
    """
    # Rewrite the element's target attribute before clicking it.
    driver_task.execute_script("arguments[0].setAttribute('target', '_self')", web_ele)
    web_ele.click()
    # Fixed pause after the click before returning to the caller.
    time.sleep(3)
|
| 137 |
+
|
| 138 |
+
|
| 139 |
+
def exec_action_type(info, web_ele, driver_task):
    """Clear ``web_ele``, type ``info['content']`` into it and press ENTER.

    Args:
        info: Parsed action payload; ``info['content']`` is the text to type.
        web_ele: Target element. Reads ``tag_name`` and the ``type``
            attribute, and calls ``clear()`` and ``send_keys()``.
        driver_task: Driver used for ``execute_script`` and ``ActionChains``.

    Returns:
        A warning string when the element does not look like a text box
        (by tag name / ``type`` attribute), otherwise an empty string.
    """
    warn_obs = ""
    type_content = info['content']

    ele_tag_name = web_ele.tag_name.lower()
    ele_type = web_ele.get_attribute("type")
    # outer_html = web_ele.get_attribute("outerHTML")
    if (ele_tag_name != 'input' and ele_tag_name != 'textarea') or (ele_tag_name == 'input' and ele_type not in ['text', 'search', 'password', 'email', 'tel']):
        warn_obs = f"note: The web element you're trying to type may not be a textbox, and its tag name is <{web_ele.tag_name}>, type is {ele_type}."
    try:
        # Not always work to delete
        web_ele.clear()
        # Another way to delete
        if platform.system() == 'Darwin':
            web_ele.send_keys(Keys.COMMAND + "a")
        else:
            web_ele.send_keys(Keys.CONTROL + "a")
        web_ele.send_keys(" ")
        web_ele.send_keys(Keys.BACKSPACE)
    except Exception:
        # Clearing is best-effort: carry on and type anyway. Narrowed from a
        # bare ``except`` so KeyboardInterrupt/SystemExit still propagate.
        pass

    actions = ActionChains(driver_task)
    actions.click(web_ele).perform()
    actions.pause(1)

    try:
        # Install a keydown handler that cancels the default action of the
        # space key (keyCode 32) unless the event target's type is text,
        # textarea or search.
        driver_task.execute_script("""window.onkeydown = function(e) {if(e.keyCode == 32 && e.target.type != 'text' && e.target.type != 'textarea' && e.target.type != 'search') {e.preventDefault();}};""")
    except Exception:
        # Best-effort as well; a failure to install the handler is ignored.
        pass

    actions.send_keys(type_content)
    actions.pause(2)

    actions.send_keys(Keys.ENTER)
    actions.perform()
    # Fixed pause after submitting before returning to the caller.
    time.sleep(10)
    return warn_obs
|
| 177 |
+
|
| 178 |
+
|
| 179 |
+
def exec_action_scroll(info, web_eles, driver_task, args, obs_info):
    """Scroll the window, or a specific element, up or down.

    Args:
        info: Parsed action payload. ``info['number']`` is ``"WINDOW"`` or an
            element identifier; ``info['content']`` is ``'down'`` for
            downward scrolling (any other value scrolls up).
        web_eles: Indexable collection of elements, used when
            ``args.text_only`` is false.
        driver_task: Driver used for ``execute_script`` and ``ActionChains``.
        args: Run configuration; reads ``window_height`` and ``text_only``.
        obs_info: Mapping/sequence whose entries carry a ``'union_bound'``
            box, used when ``args.text_only`` is true.
            NOTE(review): the box is presumably ``[x, y, width, height]``;
            confirm against the code that builds ``obs_info``.
    """
    scroll_ele_number = info['number']
    scroll_content = info['content']
    scroll_down = scroll_content == 'down'

    if scroll_ele_number == "WINDOW":
        # Window scroll: move by two thirds of the configured window height.
        if scroll_down:
            driver_task.execute_script(f"window.scrollBy(0, {args.window_height*2//3});")
        else:
            driver_task.execute_script(f"window.scrollBy(0, {-args.window_height*2//3});")
    else:
        if not args.text_only:
            scroll_ele_number = int(scroll_ele_number)
            web_ele = web_eles[scroll_ele_number]
        else:
            # Resolve the element from the point computed out of its box.
            element_box = obs_info[scroll_ele_number]['union_bound']
            element_box_center = (element_box[0] + element_box[2] // 2, element_box[1] + element_box[3] // 2)
            web_ele = driver_task.execute_script("return document.elementFromPoint(arguments[0], arguments[1]);", element_box_center[0], element_box_center[1])

        actions = ActionChains(driver_task)
        driver_task.execute_script("arguments[0].focus();", web_ele)
        # Element scroll: send ALT + arrow key after focusing the element.
        arrow_key = Keys.ARROW_DOWN if scroll_down else Keys.ARROW_UP
        actions.key_down(Keys.ALT).send_keys(arrow_key).key_up(Keys.ALT).perform()

    # Fixed pause after scrolling before returning to the caller.
    time.sleep(3)
|
| 202 |
+
|
| 203 |
|
| 204 |
def webvoyager_run(args, task, task_dir):
|
| 205 |
"""
|
|
|
|
| 248 |
|
| 249 |
action_key, info = extract_information(gpt_4v_res.split('Action:')[1].strip())
|
| 250 |
|
| 251 |
+
# ... (action execution logic)
|
| 252 |
+
|
| 253 |
if action_key == 'answer':
|
| 254 |
break
|
| 255 |
|