ZhiyuanZeng committed on
Commit
c4bedee
·
1 Parent(s): 59b6e0f
Files changed (1) hide show
  1. server/RLVE_Gym_environment.py +24 -14
server/RLVE_Gym_environment.py CHANGED
@@ -33,7 +33,15 @@ class RlveGymEnvironment(Environment):
33
  answer_markers: Optional[Tuple[str, str]] = None,
34
  initial_seed: int = None,
35
  ):
36
- """Initialize the RLVE_Gym environment."""
 
 
 
 
 
 
 
 
37
 
38
  if environment_identifier is not None :
39
  self.environment_identifier = environment_identifier
@@ -69,10 +77,11 @@ class RlveGymEnvironment(Environment):
69
  Reset the environment.
70
 
71
  Returns:
72
- problem_input: The generated problem input string (or None if generation failed)
73
- verifier_result: None
74
- success: Boolean indicating whether the reset was successful
75
- message: The result of the reset
 
76
  """
77
  if (self.environment_identifier not in identifier2environment) or (
78
  self.environment_identifier not in identifier2controller
@@ -152,13 +161,15 @@ class RlveGymEnvironment(Environment):
152
  Execute a step in the environment by verifying the model output.
153
 
154
  Args:
155
- action: RlveGymAction containing the output to verify
 
156
 
157
  Returns:
158
- problem_input: The problem input string from the current state
159
- verifier_result: Result of the verification containing accuracy and other metrics
160
- success: Boolean indicating whether the step (verification) was successful
161
- message: The result of the step
 
162
  """
163
  if self.problem is None:
164
  return RlveGymObservation(
@@ -197,9 +208,8 @@ class RlveGymEnvironment(Environment):
197
  Get the current environment state.
198
 
199
  Returns:
200
- seed: The current random seed value for problem generation
201
- problem_input: The generated problem input string (or None if generation failed)
202
- num_samples: Number of samples taken so far
203
- sum_accuracy: Sum of accuracies from verifications so far
204
  """
205
  return self._state
 
33
  answer_markers: Optional[Tuple[str, str]] = None,
34
  initial_seed: int = None,
35
  ):
36
+ """
37
+ Initialize the RLVE_Gym environment.
38
+
39
+ Args:
40
+ environment_identifier (str): The environment's identifier. Check server/Gym/environments/__init__.py for detailed usage.
41
+ difficulty (int): The difficulty of generated problems.
42
+ answer_markers (Optional[Tuple[str, str]]): A pair of marker strings that determines how the environment extracts the final answer from a model output.
43
+ initial_seed (int): The initial seed to use when generating the first problem. Whenever reset() is called, the seed will be incremented by 1.
44
+ """
45
 
46
  if environment_identifier is not None :
47
  self.environment_identifier = environment_identifier
 
77
  Reset the environment.
78
 
79
  Returns:
80
+ problem_input (Optional[str]): The input of the problem; if it is None, it means that the problem generation has not been run or has failed.
81
+ verifier_result (Optional[dict]): Contains reward as the raw reward, accuracy as the 0/1 correctness, and format_score as the 0/1 format correctness; if it is None, it means that the verification has failed.
82
+ success (bool): Whether the operation succeeded.
83
+ message (str): An explanation of the outcome (why the operation succeeded or failed).
84
+ reward (Optional[float]): The value is verifier_result["reward"] when verifier_result is not None (otherwise, reward is also None).
85
  """
86
  if (self.environment_identifier not in identifier2environment) or (
87
  self.environment_identifier not in identifier2controller
 
161
  Execute a step in the environment by verifying the model output.
162
 
163
  Args:
164
+ action (RlveGymAction): Contains a single field:
165
+ - output (str): The model's output to get verified.
166
 
167
  Returns:
168
+ problem_input (Optional[str]): The input of the problem; if it is None, it means that the problem generation has not been run or has failed.
169
+ verifier_result (Optional[dict]): Contains reward as the raw reward, accuracy as the 0/1 correctness, and format_score as the 0/1 format correctness; if it is None, it means that the verification has failed.
170
+ success (bool): Whether the operation succeeded.
171
+ message (str): An explanation of the outcome (why the operation succeeded or failed).
172
+ reward (Optional[float]): The value is verifier_result["reward"] when verifier_result is not None (otherwise, reward is also None).
173
  """
174
  if self.problem is None:
175
  return RlveGymObservation(
 
208
  Get the current environment state.
209
 
210
  Returns:
211
+ seed (int): The seed to use when running reset().
212
+ problem_input (Optional[str]): The input of the problem; if it is None, it means that the problem generation has not been run, or it failed.
213
+ num_samples (int) and sum_accuracy (int): The statistics of the result of `step(action)` so far for the current problem (the number of outputs sent to the verifier and the number of correct ones).
 
214
  """
215
  return self._state