Joseph Pollack committed on
Commit
709ae40
·
unverified ·
1 Parent(s): ed0bc6e

better input processing and outputs string

Browse files
Files changed (1) hide show
  1. app.py +25 -58
app.py CHANGED
@@ -70,7 +70,7 @@ class LOperatorDemo:
70
 
71
  @spaces.GPU(duration=120) # 2 minutes for action generation
72
  def generate_action(self, image: Image.Image, goal: str, instruction: str) -> str:
73
- """Generate action based on image and text inputs"""
74
  if not self.is_loaded:
75
  return "❌ Model not loaded. Please load the model first."
76
 
@@ -79,7 +79,13 @@ class LOperatorDemo:
79
  if image.mode != "RGB":
80
  image = image.convert("RGB")
81
 
82
- # Build conversation
 
 
 
 
 
 
83
  conversation = [
84
  {
85
  "role": "system",
@@ -91,71 +97,32 @@ class LOperatorDemo:
91
  "role": "user",
92
  "content": [
93
  {"type": "image", "image": image},
94
- {"type": "text", "text": f"Goal: {goal}\nStep: {instruction}\nRespond with a JSON action containing relevant keys (e.g., action_type, x, y, text, app_name, direction)."}
95
  ]
96
  }
97
  ]
98
 
99
  logger.info("Processing conversation with processor...")
100
 
101
- # Process inputs with better error handling
102
- try:
103
- inputs = self.processor.apply_chat_template(
104
- conversation,
105
- add_generation_prompt=True,
106
- return_tensors="pt"
107
- )
108
- logger.info(f"Processor output type: {type(inputs)}")
109
-
110
- # If processor returns a string, just return it directly
111
- if isinstance(inputs, str):
112
- logger.info("Processor returned string, returning directly...")
113
- return inputs
114
-
115
- # Handle other return types
116
- if isinstance(inputs, dict):
117
- # If processor returns a dict, extract input_ids
118
- logger.info("Processor returned dict, extracting input_ids...")
119
- inputs = inputs["input_ids"]
120
- elif not isinstance(inputs, torch.Tensor):
121
- logger.warning("apply_chat_template did not return a tensor, attempting to convert...")
122
- if isinstance(inputs, (list, tuple)):
123
- inputs = torch.tensor(inputs)
124
- else:
125
- # If it's an unexpected type, return the string directly
126
- logger.warning(f"Unexpected input type: {type(inputs)}, returning as string")
127
- return str(inputs)
128
-
129
- inputs = inputs.to(self.model.device)
130
- logger.info(f"Inputs shape: {inputs.shape}, device: {inputs.device}")
131
-
132
- except Exception as e:
133
- logger.error(f"Error in processor: {str(e)}")
134
- return f"❌ Error in processor: {str(e)}"
135
 
136
- # Generate response
137
- logger.info("Generating response...")
138
- with torch.no_grad():
139
- outputs = self.model.generate(
140
- inputs,
141
- max_new_tokens=128,
142
- do_sample=True,
143
- temperature=0.7,
144
- top_p=0.9
145
- )
146
 
147
- logger.info("Decoding response...")
148
- response = self.processor.tokenizer.decode(
149
- outputs[0][inputs.shape[1]:],
150
- skip_special_tokens=True
151
- )
152
 
153
- # Try to parse as JSON for better formatting
154
- try:
155
- parsed_response = json.loads(response)
156
- return json.dumps(parsed_response, indent=2)
157
- except:
158
- return response
159
 
160
  except Exception as e:
161
  logger.error(f"Error generating action: {str(e)}")
 
70
 
71
  @spaces.GPU(duration=120) # 2 minutes for action generation
72
  def generate_action(self, image: Image.Image, goal: str, instruction: str) -> str:
73
+ """Generate action based on image and text inputs using the same format as training"""
74
  if not self.is_loaded:
75
  return "❌ Model not loaded. Please load the model first."
76
 
 
79
  if image.mode != "RGB":
80
  image = image.convert("RGB")
81
 
82
+ # Build conversation using the EXACT same format as training
83
+ user_text = (
84
+ f"Goal: {goal}\n"
85
+ f"Step: {instruction}\n"
86
+ "Respond with a JSON action containing relevant keys (e.g., action_type, x, y, text, app_name, direction)."
87
+ )
88
+
89
  conversation = [
90
  {
91
  "role": "system",
 
97
  "role": "user",
98
  "content": [
99
  {"type": "image", "image": image},
100
+ {"type": "text", "text": user_text}
101
  ]
102
  }
103
  ]
104
 
105
  logger.info("Processing conversation with processor...")
106
 
107
+ # Process inputs using the same method as training
108
+ inputs = self.processor.apply_chat_template(
109
+ conversation,
110
+ add_generation_prompt=True,
111
+ return_tensors="pt",
112
+ return_dict=True,
113
+ tokenize=True,
114
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
115
 
116
+ logger.info(f"Processor output type: {type(inputs)}")
 
 
 
 
 
 
 
 
 
117
 
118
+ # If processor returns a string, just return it directly
119
+ if isinstance(inputs, str):
120
+ logger.info("Processor returned string, returning directly...")
121
+ return inputs
 
122
 
123
+ # If it's a dict or other type, convert to string and return
124
+ logger.info("Converting processor output to string...")
125
+ return str(inputs)
 
 
 
126
 
127
  except Exception as e:
128
  logger.error(f"Error generating action: {str(e)}")