harvesthealth committed on
Commit
40dd678
·
verified ·
1 Parent(s): 663480b

Upload folder using huggingface_hub

Browse files
Files changed (1) hide show
  1. run.py +118 -2
run.py CHANGED
@@ -84,7 +84,122 @@ def format_msg(it, init_msg, pdf_obs, warn_obs, web_img_b64, web_text):
84
  ]
85
  }
86
 
87
- # ... (rest of the helper functions remain the same)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
88
 
89
  def webvoyager_run(args, task, task_dir):
90
  """
@@ -133,7 +248,8 @@ def webvoyager_run(args, task, task_dir):
133
 
134
  action_key, info = extract_information(gpt_4v_res.split('Action:')[1].strip())
135
 
136
- # ... (action execution logic remains the same)
 
137
  if action_key == 'answer':
138
  break
139
 
 
84
  ]
85
  }
86
 
87
def call_gpt4v_api(args, openai_client, messages):
    """Send ``messages`` to the chat-completions endpoint, retrying transient errors.

    Args:
        args: Run configuration. Reads ``args.text_only``, ``args.api_model``
            and ``args.seed``.
        openai_client: Client object exposing ``chat.completions.create(...)``.
        messages: Message list passed through unchanged to the API.

    Returns:
        A 4-tuple ``(prompt_tokens, completion_tokens, gpt_call_error, response)``.
        On success ``gpt_call_error`` is ``False`` and the token counts come
        from ``response.usage``. On a non-retryable error, or after 10 failed
        attempts, the tuple is ``(None, None, True, None)``.
    """
    # Back-off (seconds) per retryable error, matched on the exception class
    # *name* so this function does not need to import the client library.
    retry_delays = {'RateLimitError': 10, 'APIError': 15}
    max_attempts = 10

    request_kwargs = {
        'model': args.api_model,
        'messages': messages,
        'max_tokens': 1000,
        'seed': args.seed,
    }
    if args.text_only:
        api_label = 'gpt4'
        # Only the text-only request carries an explicit timeout.
        request_kwargs['timeout'] = 30
    else:
        api_label = 'gpt4v'

    for _ in range(max_attempts):
        try:
            logging.info(f'Calling {api_label} API...')
            openai_response = openai_client.chat.completions.create(**request_kwargs)

            prompt_tokens = openai_response.usage.prompt_tokens
            completion_tokens = openai_response.usage.completion_tokens
            logging.info(f'Prompt Tokens: {prompt_tokens}; Completion Tokens: {completion_tokens}')

            return prompt_tokens, completion_tokens, False, openai_response

        except Exception as e:
            error_name = type(e).__name__
            delay = retry_delays.get(error_name)
            if delay is None:
                # Anything else (including 'InvalidRequestError') is treated
                # as fatal for this call; say so instead of claiming a retry.
                logging.error(f'Error occurred, not retrying. Error type: {error_name}: {e}')
                return None, None, True, None

            logging.info(f'Error occurred, retrying. Error type: {error_name}: {e}')
            time.sleep(delay)

    logging.info('Retrying too many times')
    return None, None, True, None
131
+
132
+
133
def exec_action_click(info, web_ele, driver_task):
    """Click ``web_ele`` after setting its ``target`` attribute to ``_self``.

    Args:
        info: Parsed action payload; not read by this function.
        web_ele: Element to click.
        driver_task: Driver used to run the attribute-setting script.
    """
    set_target_self_js = "arguments[0].setAttribute('target', '_self')"
    driver_task.execute_script(set_target_self_js, web_ele)
    web_ele.click()
    # Fixed pause after the click before the caller continues.
    time.sleep(3)
137
+
138
+
139
def exec_action_type(info, web_ele, driver_task):
    """Clear ``web_ele``, type ``info['content']`` into it and press ENTER.

    Args:
        info: Parsed action payload; ``info['content']`` is the text to type.
        web_ele: Target element. Reads ``tag_name`` and the ``type`` attribute.
        driver_task: Driver used for script execution and the action chain.

    Returns:
        An empty string, or a note for the model when the element is neither
        a ``<textarea>`` nor an ``<input>`` of a text-like type.
    """
    warn_obs = ""
    type_content = info['content']

    ele_tag_name = web_ele.tag_name.lower()
    ele_type = web_ele.get_attribute("type")
    is_text_input = ele_tag_name == 'input' and ele_type in ('text', 'search', 'password', 'email', 'tel')
    if not (is_text_input or ele_tag_name == 'textarea'):
        warn_obs = f"note: The web element you're trying to type may not be a textbox, and its tag name is <{web_ele.tag_name}>, type is {ele_type}."

    try:
        # clear() does not always work, so also select-all and overwrite.
        web_ele.clear()
        select_all_modifier = Keys.COMMAND if platform.system() == 'Darwin' else Keys.CONTROL
        web_ele.send_keys(select_all_modifier + "a")
        web_ele.send_keys(" ")
        web_ele.send_keys(Keys.BACKSPACE)
    except Exception as e:
        # Best-effort: clearing may fail on non-editable elements; typing is
        # still attempted below. KeyboardInterrupt/SystemExit now propagate.
        logging.debug('Could not clear element before typing: %s', e)

    actions = ActionChains(driver_task)
    actions.click(web_ele).perform()
    actions.pause(1)

    try:
        # Stop the space key from triggering default page behavior when the
        # event target is not a text/textarea/search field.
        driver_task.execute_script("""window.onkeydown = function(e) {if(e.keyCode == 32 && e.target.type != 'text' && e.target.type != 'textarea' && e.target.type != 'search') {e.preventDefault();}};""")
    except Exception as e:
        # Best-effort: typing proceeds even if the handler cannot be installed.
        logging.debug('Could not install space-key handler: %s', e)

    actions.send_keys(type_content)
    actions.pause(2)

    actions.send_keys(Keys.ENTER)
    actions.perform()
    # Fixed pause after submitting before the caller continues.
    time.sleep(10)
    return warn_obs
177
+
178
+
179
def exec_action_scroll(info, web_eles, driver_task, args, obs_info):
    """Scroll the window, or a specific element, up or down.

    Args:
        info: Parsed action payload. ``info['number']`` is ``"WINDOW"`` or an
            element identifier; ``info['content']`` is ``'down'`` for a
            downward scroll (any other value scrolls up).
        web_eles: Indexable collection of elements, used when not text-only.
        driver_task: Driver used for script execution and the action chain.
        args: Run configuration. Reads ``args.window_height`` and
            ``args.text_only``.
        obs_info: Lookup giving a ``'union_bound'`` box per element, used in
            text-only mode.
    """
    target = info['number']
    scrolling_down = info['content'] == 'down'

    if target == "WINDOW":
        # Move the page by two thirds of the window height.
        if scrolling_down:
            scroll_js = f"window.scrollBy(0, {args.window_height*2//3});"
        else:
            scroll_js = f"window.scrollBy(0, {-args.window_height*2//3});"
        driver_task.execute_script(scroll_js)
    else:
        if args.text_only:
            # NOTE(review): obs_info is indexed with the raw value from
            # info['number'] here, without int() conversion — confirm key type.
            box = obs_info[target]['union_bound']
            center_x = box[0] + box[2] // 2
            center_y = box[1] + box[3] // 2
            web_ele = driver_task.execute_script("return document.elementFromPoint(arguments[0], arguments[1]);", center_x, center_y)
        else:
            web_ele = web_eles[int(target)]

        actions = ActionChains(driver_task)
        driver_task.execute_script("arguments[0].focus();", web_ele)
        # ALT + arrow key is sent while the element has focus.
        arrow_key = Keys.ARROW_DOWN if scrolling_down else Keys.ARROW_UP
        actions.key_down(Keys.ALT).send_keys(arrow_key).key_up(Keys.ALT).perform()

    # Fixed pause after scrolling before the caller continues.
    time.sleep(3)
202
+
203
 
204
  def webvoyager_run(args, task, task_dir):
205
  """
 
248
 
249
  action_key, info = extract_information(gpt_4v_res.split('Action:')[1].strip())
250
 
251
+ # ... (action execution logic)
252
+
253
  if action_key == 'answer':
254
  break
255