{
  "visual_encoder": {
    "image_encoder": "Dinov2Small",
    "text_encoder": "t5-small",
    "fusion_xformer": {
      "num_layers": 3,
      "d_model": 512,
      "nhead": 8
    },
    "input_sensors": [
      "raw_navigation_camera",
      "raw_manipulation_camera",
      "last_actions",
      "an_object_is_in_hand"
    ],
    "bbox_encoding_type": "positional"
  },
  "visual_text_encoder_class": "TextCondMultiCameraVisualEncoder",
  "decoder": {
    "num_layers": 3,
    "d_model": 512,
    "nhead": 8
  },
  "num_actions": 20,
  "max_length": 1000,
  "action_loss": true,
  "use_llama_decoder": true,
  "_attn_implementation_autoset": false,
  "torch_dtype": "float32",
  "architectures": [
    "EarlyFusionCnnTransformer"
  ],
  "model_type": "MM",
  "transformers_version": "4.49.0"
}