Spaces:
Running
Running
Commit
·
c4bedee
1
Parent(s):
59b6e0f
misc
Browse files- server/RLVE_Gym_environment.py +24 -14
server/RLVE_Gym_environment.py
CHANGED
|
@@ -33,7 +33,15 @@ class RlveGymEnvironment(Environment):
|
|
| 33 |
answer_markers: Optional[Tuple[str, str]] = None,
|
| 34 |
initial_seed: int = None,
|
| 35 |
):
|
| 36 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
|
| 38 |
if environment_identifier is not None :
|
| 39 |
self.environment_identifier = environment_identifier
|
|
@@ -69,10 +77,11 @@ class RlveGymEnvironment(Environment):
|
|
| 69 |
Reset the environment.
|
| 70 |
|
| 71 |
Returns:
|
| 72 |
-
problem_input: The
|
| 73 |
-
verifier_result: None
|
| 74 |
-
success:
|
| 75 |
-
message: The
|
|
|
|
| 76 |
"""
|
| 77 |
if (self.environment_identifier not in identifier2environment) or (
|
| 78 |
self.environment_identifier not in identifier2controller
|
|
@@ -152,13 +161,15 @@ class RlveGymEnvironment(Environment):
|
|
| 152 |
Execute a step in the environment by verifying the model output.
|
| 153 |
|
| 154 |
Args:
|
| 155 |
-
action
|
|
|
|
| 156 |
|
| 157 |
Returns:
|
| 158 |
-
problem_input: The problem
|
| 159 |
-
verifier_result:
|
| 160 |
-
success:
|
| 161 |
-
message: The
|
|
|
|
| 162 |
"""
|
| 163 |
if self.problem is None:
|
| 164 |
return RlveGymObservation(
|
|
@@ -197,9 +208,8 @@ class RlveGymEnvironment(Environment):
|
|
| 197 |
Get the current environment state.
|
| 198 |
|
| 199 |
Returns:
|
| 200 |
-
seed: The
|
| 201 |
-
problem_input: The
|
| 202 |
-
num_samples:
|
| 203 |
-
sum_accuracy: Sum of accuracies from verifications so far
|
| 204 |
"""
|
| 205 |
return self._state
|
|
|
|
| 33 |
answer_markers: Optional[Tuple[str, str]] = None,
|
| 34 |
initial_seed: int = None,
|
| 35 |
):
|
| 36 |
+
"""
|
| 37 |
+
Initialize the RLVE_Gym environment.
|
| 38 |
+
|
| 39 |
+
Args:
|
| 40 |
+
environment_identifier (str): The environment's identifier. Check server/Gym/environments/__init__.py for detailed usage.
|
| 41 |
+
difficulty (int): The difficulty of generated problems.
|
| 42 |
+
answer_markers (Optional[Tuple[str, str]]): How the environment extracts the final answer from a model output.
|
| 43 |
+
initial_seed (int): The initial seed to use when generating the first problem. Whenever reset() is called, the seed will be incremented by 1.
|
| 44 |
+
"""
|
| 45 |
|
| 46 |
if environment_identifier is not None :
|
| 47 |
self.environment_identifier = environment_identifier
|
|
|
|
| 77 |
Reset the environment.
|
| 78 |
|
| 79 |
Returns:
|
| 80 |
+
problem_input (Optional[str]): The input of the problem; None means that problem generation has not been run or has failed.
|
| 81 |
+
verifier_result (Optional[dict]): Contains reward as the raw reward, accuracy as the 0/1 correctness, and format_score as the 0/1 format correctness; if it is None, it means that the verification has failed.
|
| 82 |
+
success (bool): Whether the operation succeeded.
|
| 83 |
+
message (str): An explanation of the `success` value.
|
| 84 |
+
reward (Optional[float]): Equal to verifier_result["reward"] when verifier_result is not None; otherwise None.
|
| 85 |
"""
|
| 86 |
if (self.environment_identifier not in identifier2environment) or (
|
| 87 |
self.environment_identifier not in identifier2controller
|
|
|
|
| 161 |
Execute a step in the environment by verifying the model output.
|
| 162 |
|
| 163 |
Args:
|
| 164 |
+
action (RlveGymAction): Contains a single field:
|
| 165 |
+
- output (str): The model's output to get verified.
|
| 166 |
|
| 167 |
Returns:
|
| 168 |
+
problem_input (Optional[str]): The input of the problem; None means that problem generation has not been run or has failed.
|
| 169 |
+
verifier_result (Optional[dict]): Contains reward as the raw reward, accuracy as the 0/1 correctness, and format_score as the 0/1 format correctness; if it is None, it means that the verification has failed.
|
| 170 |
+
success (bool): Whether the operation succeeded.
|
| 171 |
+
message (str): An explanation of the `success` value.
|
| 172 |
+
reward (Optional[float]): Equal to verifier_result["reward"] when verifier_result is not None; otherwise None.
|
| 173 |
"""
|
| 174 |
if self.problem is None:
|
| 175 |
return RlveGymObservation(
|
|
|
|
| 208 |
Get the current environment state.
|
| 209 |
|
| 210 |
Returns:
|
| 211 |
+
seed (int): The seed to use when running reset().
|
| 212 |
+
problem_input (Optional[str]): The input of the problem; None means that problem generation has not been run or has failed.
|
| 213 |
+
num_samples (int) and sum_accuracy (int): Statistics of the `step(action)` results so far for the current problem: the number of outputs sent to the verifier and the number of correct ones, respectively.
|
|
|
|
| 214 |
"""
|
| 215 |
return self._state
|