Spaces:

Tonic
/

l-operator-demo

Running on Zero

App Files Community

Joseph Pollack committed on Aug 29

Commit

709ae40

unverified ·

1 Parent(s): ed0bc6e

better input processing and outputs string

Browse files

Files changed (1) hide show

app.py +25 -58

app.py CHANGED Viewed

@@ -70,7 +70,7 @@ class LOperatorDemo:
     @spaces.GPU(duration=120)  # 2 minutes for action generation
     def generate_action(self, image: Image.Image, goal: str, instruction: str) -> str:
-        """Generate action based on image and text inputs"""
         if not self.is_loaded:
             return "❌ Model not loaded. Please load the model first."
@@ -79,7 +79,13 @@ class LOperatorDemo:
             if image.mode != "RGB":
                 image = image.convert("RGB")
-            # Build conversation
             conversation = [
                 {
                     "role": "system",
@@ -91,71 +97,32 @@ class LOperatorDemo:
                     "role": "user",
                     "content": [
                         {"type": "image", "image": image},
-                        {"type": "text", "text": f"Goal: {goal}\nStep: {instruction}\nRespond with a JSON action containing relevant keys (e.g., action_type, x, y, text, app_name, direction)."}
                     ]
                 }
             ]
             logger.info("Processing conversation with processor...")
-            # Process inputs with better error handling
-            try:
-                inputs = self.processor.apply_chat_template(
-                    conversation,
-                    add_generation_prompt=True,
-                    return_tensors="pt"
-                )
-                logger.info(f"Processor output type: {type(inputs)}")
-                # If processor returns a string, just return it directly
-                if isinstance(inputs, str):
-                    logger.info("Processor returned string, returning directly...")
-                    return inputs
-                # Handle other return types
-                if isinstance(inputs, dict):
-                    # If processor returns a dict, extract input_ids
-                    logger.info("Processor returned dict, extracting input_ids...")
-                    inputs = inputs["input_ids"]
-                elif not isinstance(inputs, torch.Tensor):
-                    logger.warning("apply_chat_template did not return a tensor, attempting to convert...")
-                    if isinstance(inputs, (list, tuple)):
-                        inputs = torch.tensor(inputs)
-                    else:
-                        # If it's an unexpected type, return the string directly
-                        logger.warning(f"Unexpected input type: {type(inputs)}, returning as string")
-                        return str(inputs)
-                inputs = inputs.to(self.model.device)
-                logger.info(f"Inputs shape: {inputs.shape}, device: {inputs.device}")
-            except Exception as e:
-                logger.error(f"Error in processor: {str(e)}")
-                return f"❌ Error in processor: {str(e)}"
-            # Generate response
-            logger.info("Generating response...")
-            with torch.no_grad():
-                outputs = self.model.generate(
-                    inputs,
-                    max_new_tokens=128,
-                    do_sample=True,
-                    temperature=0.7,
-                    top_p=0.9
-                )
-            logger.info("Decoding response...")
-            response = self.processor.tokenizer.decode(
-                outputs[0][inputs.shape[1]:],
-                skip_special_tokens=True
-            )
-            # Try to parse as JSON for better formatting
-            try:
-                parsed_response = json.loads(response)
-                return json.dumps(parsed_response, indent=2)
-            except:
-                return response
         except Exception as e:
             logger.error(f"Error generating action: {str(e)}")

     @spaces.GPU(duration=120)  # 2 minutes for action generation
     def generate_action(self, image: Image.Image, goal: str, instruction: str) -> str:
+        """Generate action based on image and text inputs using the same format as training"""
         if not self.is_loaded:
             return "❌ Model not loaded. Please load the model first."
             if image.mode != "RGB":
                 image = image.convert("RGB")
+            # Build conversation using the EXACT same format as training
+            user_text = (
+                f"Goal: {goal}\n"
+                f"Step: {instruction}\n"
+                "Respond with a JSON action containing relevant keys (e.g., action_type, x, y, text, app_name, direction)."
+            )
             conversation = [
                 {
                     "role": "system",
                     "role": "user",
                     "content": [
                         {"type": "image", "image": image},
+                        {"type": "text", "text": user_text}
                     ]
                 }
             ]
             logger.info("Processing conversation with processor...")
+            # Process inputs using the same method as training
+            inputs = self.processor.apply_chat_template(
+                conversation,
+                add_generation_prompt=True,
+                return_tensors="pt",
+                return_dict=True,
+                tokenize=True,
+            )
+            logger.info(f"Processor output type: {type(inputs)}")
+            # If processor returns a string, just return it directly
+            if isinstance(inputs, str):
+                logger.info("Processor returned string, returning directly...")
+                return inputs
+            # If it's a dict or other type, convert to string and return
+            logger.info("Converting processor output to string...")
+            return str(inputs)
         except Exception as e:
             logger.error(f"Error generating action: {str(e)}")