Add standalone loading option using only transformers

README.md (changed):

### Load Model

Two checkpoint formats are available; a short download-and-load sketch for both formats follows the list:

- `model.pt` (932 MB) - PyTorch format, smaller due to shared tensors
- `model.safetensors` (1.35 GB) - SafeTensors format, recommended for production
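
Both files can be fetched with `hf_hub_download`. A minimal sketch, assuming both checkpoints serialize the same tensor names (pick whichever format you downloaded):

```python
import torch
from huggingface_hub import hf_hub_download
from safetensors.torch import load_file

repo_id = "llm-semantic-router/multi-modal-embed-small"

# PyTorch format: pickle-based checkpoint
pt_path = hf_hub_download(repo_id=repo_id, filename="model.pt")
state_dict = torch.load(pt_path, map_location="cpu")

# SafeTensors format: loads without pickle execution
st_path = hf_hub_download(repo_id=repo_id, filename="model.safetensors")
state_dict = load_file(st_path)  # same keys, assuming the formats match
```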

**Option 1: Using the source repository (full features)**

```bash
git clone https://github.com/semantic-router/2DMSE-Multimodal-Embedder.git
cd 2DMSE-Multimodal-Embedder
pip install -e .
```

```python
from src.models import MultimodalEmbedder

model = MultimodalEmbedder.from_pretrained("llm-semantic-router/multi-modal-embed-small")
```
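
If `MultimodalEmbedder` is a regular PyTorch `nn.Module` (an assumption; check the repository source), the usual inference setup applies:

```python
import torch

model.eval()  # disable dropout and other train-time behavior
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)
```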

**Option 2: Standalone with transformers (no repo needed)**

```python
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoModel, AutoTokenizer, SiglipModel, SiglipProcessor, WhisperModel, WhisperFeatureExtractor
from huggingface_hub import hf_hub_download

# Download weights
checkpoint_path = hf_hub_download(
    repo_id="llm-semantic-router/multi-modal-embed-small",
    filename="model.pt",
)
state_dict = torch.load(checkpoint_path, map_location="cpu")

# Load text encoder
text_tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
text_encoder = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

# Load image encoder
image_processor = SiglipProcessor.from_pretrained("google/siglip-base-patch16-512")
image_encoder = SiglipModel.from_pretrained("google/siglip-base-patch16-512").vision_model

# Load audio encoder
audio_processor = WhisperFeatureExtractor.from_pretrained("openai/whisper-tiny")
audio_encoder = WhisperModel.from_pretrained("openai/whisper-tiny").encoder

# The trained projection weights live in the checkpoint:
# - text:  state_dict keys starting with "text_encoder.projection"
# - image: state_dict keys starting with "image_encoder.projection"
# - audio: state_dict keys starting with "audio_encoder.projection"

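# Added sketch (not part of the original snippet): inspect the projection
# parameters first; if a projection turns out to be a single nn.Linear
# (an assumption), it can be rebuilt and loaded like this.
for key, tensor in state_dict.items():
    if ".projection" in key:
        print(key, tuple(tensor.shape))

w_key, b_key = "text_encoder.projection.weight", "text_encoder.projection.bias"
if w_key in state_dict and b_key in state_dict:
    out_dim, in_dim = state_dict[w_key].shape
    text_projection = nn.Linear(in_dim, out_dim)
    text_projection.load_state_dict(
        {"weight": state_dict[w_key], "bias": state_dict[b_key]}
    )
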
def encode_text(texts, tokenizer=text_tokenizer, encoder=text_encoder):
    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        outputs = encoder(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1)  # Mean pooling
    embeddings = F.normalize(embeddings, p=2, dim=-1)
    return embeddings

def encode_image(images, processor=image_processor, encoder=image_encoder):
    inputs = processor(images=images, return_tensors="pt")
    with torch.no_grad():
        outputs = encoder(inputs.pixel_values)
    embeddings = outputs.pooler_output
    embeddings = F.normalize(embeddings, p=2, dim=-1)
    return embeddings

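# Added sketch (not part of the original snippet): an audio counterpart,
# assuming 16 kHz mono waveforms and mean pooling over encoder states.
def encode_audio(waveforms, processor=audio_processor, encoder=audio_encoder):
    inputs = processor(waveforms, sampling_rate=16000, return_tensors="pt")
    with torch.no_grad():
        outputs = encoder(inputs.input_features)
    embeddings = outputs.last_hidden_state.mean(dim=1)  # Mean pooling
    return F.normalize(embeddings, p=2, dim=-1)
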
# Example usage
text_emb = encode_text(["A photo of a cat"])
print(f"Text embedding shape: {text_emb.shape}")
```

> **Note**: Option 2 loads the base encoders but not the trained projection layers. For the full model with trained weights, use Option 1.
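
Without those projections the encoders do not share an embedding space; their raw widths even differ (all-MiniLM-L6-v2 is 384-d, the SigLIP base vision tower 768-d), which is easy to verify by continuing the snippet above:

```python
from PIL import Image

image = Image.new("RGB", (512, 512))  # blank placeholder, just for shapes
img_emb = encode_image([image])
print(text_emb.shape, img_emb.shape)  # expected: (1, 384) vs (1, 768)
```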

### Text Embedding

```python