HuaminChen committed
Commit e737256 · verified · 1 Parent(s): 6445452

Make README fully standalone - no external repo needed

Files changed (1): README.md (+62 -55)
README.md CHANGED
@@ -85,23 +85,8 @@ pip install torch transformers pillow safetensors
  ### Load Model
 
  Two checkpoint formats are available:
- - `model.pt` (932 MB) - PyTorch format, smaller due to shared tensors
- - `model.safetensors` (1.35 GB) - SafeTensors format, recommended for production
-
- **Option 1: Using the source repository (full features)**
-
- ```bash
- git clone https://github.com/semantic-router/2DMSE-Multimodal-Embedder.git
- cd 2DMSE-Multimodal-Embedder
- pip install -e .
- ```
-
- ```python
- from src.models import MultimodalEmbedder
- model = MultimodalEmbedder.from_pretrained("llm-semantic-router/multi-modal-embed-small")
- ```
-
- **Option 2: Standalone with transformers (no repo needed)**
 
  ```python
  import torch
@@ -110,53 +95,75 @@ import torch.nn.functional as F
  from transformers import AutoModel, AutoTokenizer, SiglipModel, SiglipProcessor, WhisperModel, WhisperFeatureExtractor
  from huggingface_hub import hf_hub_download
 
- # Download weights
  checkpoint_path = hf_hub_download(
      repo_id="llm-semantic-router/multi-modal-embed-small",
      filename="model.pt"
  )
  state_dict = torch.load(checkpoint_path, map_location="cpu")
 
- # Load text encoder
- text_tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
- text_encoder = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
-
- # Load image encoder
- image_processor = SiglipProcessor.from_pretrained("google/siglip-base-patch16-512")
- image_encoder = SiglipModel.from_pretrained("google/siglip-base-patch16-512").vision_model
-
- # Load audio encoder
- audio_processor = WhisperFeatureExtractor.from_pretrained("openai/whisper-tiny")
- audio_encoder = WhisperModel.from_pretrained("openai/whisper-tiny").encoder
-
- # Load trained projection weights from checkpoint
- # Text projection: state_dict keys starting with "text_encoder.projection"
- # Image projection: state_dict keys starting with "image_encoder.projection"
- # Audio projection: state_dict keys starting with "audio_encoder.projection"
-
- def encode_text(texts, tokenizer=text_tokenizer, encoder=text_encoder):
-     inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
-     with torch.no_grad():
-         outputs = encoder(**inputs)
-     embeddings = outputs.last_hidden_state.mean(dim=1)  # Mean pooling
-     embeddings = F.normalize(embeddings, p=2, dim=-1)
-     return embeddings
-
- def encode_image(images, processor=image_processor, encoder=image_encoder):
-     inputs = processor(images=images, return_tensors="pt")
-     with torch.no_grad():
-         outputs = encoder(inputs.pixel_values)
-     embeddings = outputs.pooler_output
-     embeddings = F.normalize(embeddings, p=2, dim=-1)
-     return embeddings
 
- # Example usage
- text_emb = encode_text(["A photo of a cat"])
- print(f"Text embedding shape: {text_emb.shape}")
  ```
 
- > **Note**: Option 2 loads the base encoders but not the trained projection layers. For full model with trained weights, use Option 1.
-
  ### Text Embedding
 
  ```python
 
  ### Load Model
 
  Two checkpoint formats are available:
+ - `model.pt` (932 MB) - PyTorch format
+ - `model.safetensors` (1.35 GB) - SafeTensors format
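The standalone snippet below loads `model.pt`. As a minimal sketch (assuming `model.safetensors` stores the same tensors under the same keys as `model.pt`), the SafeTensors checkpoint could be loaded instead via the `safetensors` package:

```python
# Sketch only: load the SafeTensors checkpoint instead of model.pt
# (assumes model.safetensors holds the same keys as model.pt)
from huggingface_hub import hf_hub_download
from safetensors.torch import load_file

st_path = hf_hub_download(
    repo_id="llm-semantic-router/multi-modal-embed-small",
    filename="model.safetensors",
)
state_dict = load_file(st_path)  # plain dict of tensors; no pickle involved
```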

  ```python
  import torch
@@ -110,53 +95,75 @@ import torch.nn.functional as F
  from transformers import AutoModel, AutoTokenizer, SiglipModel, SiglipProcessor, WhisperModel, WhisperFeatureExtractor
  from huggingface_hub import hf_hub_download
 
+ class MultiModalEmbedder(nn.Module):
+     """Standalone multimodal embedder - no external dependencies."""
+
+     def __init__(self):
+         super().__init__()
+         # Text encoder
+         self.text_tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
+         self.text_encoder = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
+         self.text_proj = nn.Linear(384, 384)
+
+         # Image encoder
+         self.image_processor = SiglipProcessor.from_pretrained("google/siglip-base-patch16-512")
+         self.image_encoder = SiglipModel.from_pretrained("google/siglip-base-patch16-512").vision_model
+         self.image_proj = nn.Linear(768, 384)
+
+         # Audio encoder
+         self.audio_processor = WhisperFeatureExtractor.from_pretrained("openai/whisper-tiny")
+         self.audio_encoder = WhisperModel.from_pretrained("openai/whisper-tiny").encoder
+         self.audio_proj = nn.Linear(384, 384)
+
+     def encode_text(self, texts):
+         if isinstance(texts, str):
+             texts = [texts]
+         inputs = self.text_tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
+         inputs = {k: v.to(next(self.parameters()).device) for k, v in inputs.items()}
+         outputs = self.text_encoder(**inputs)
+         embeddings = outputs.last_hidden_state.mean(dim=1)
+         embeddings = self.text_proj(embeddings)
+         return F.normalize(embeddings, p=2, dim=-1)
+
+     def encode_image(self, images):
+         inputs = self.image_processor(images=images, return_tensors="pt")
+         inputs = {k: v.to(next(self.parameters()).device) for k, v in inputs.items()}
+         outputs = self.image_encoder(**inputs)
+         embeddings = outputs.pooler_output
+         embeddings = self.image_proj(embeddings)
+         return F.normalize(embeddings, p=2, dim=-1)
+
+     def encode_audio(self, waveform):
+         # waveform: [batch, samples] at 16kHz
+         inputs = self.audio_processor(waveform.squeeze().numpy(), sampling_rate=16000, return_tensors="pt")
+         inputs = {k: v.to(next(self.parameters()).device) for k, v in inputs.items()}
+         outputs = self.audio_encoder(**inputs)
+         embeddings = outputs.last_hidden_state.mean(dim=1)
+         embeddings = self.audio_proj(embeddings)
+         return F.normalize(embeddings, p=2, dim=-1)
+
+ # Load model
+ model = MultiModalEmbedder()
+
+ # Download and load trained weights
  checkpoint_path = hf_hub_download(
      repo_id="llm-semantic-router/multi-modal-embed-small",
      filename="model.pt"
  )
  state_dict = torch.load(checkpoint_path, map_location="cpu")
 
+ # Map checkpoint keys to our model
+ model.text_encoder.load_state_dict({k.replace("text_encoder.encoder.", ""): v for k, v in state_dict.items() if k.startswith("text_encoder.encoder.")})
+ model.text_proj.load_state_dict({k.replace("text_encoder.projection.", ""): v for k, v in state_dict.items() if k.startswith("text_encoder.projection.")})
+ model.image_encoder.load_state_dict({k.replace("image_encoder.vision_encoder.", ""): v for k, v in state_dict.items() if k.startswith("image_encoder.vision_encoder.")})
+ model.image_proj.load_state_dict({k.replace("image_encoder.projection.", ""): v for k, v in state_dict.items() if k.startswith("image_encoder.projection.")})
+ model.audio_encoder.load_state_dict({k.replace("audio_encoder.encoder.", ""): v for k, v in state_dict.items() if k.startswith("audio_encoder.encoder.")})
+ model.audio_proj.load_state_dict({k.replace("audio_encoder.projection.", ""): v for k, v in state_dict.items() if k.startswith("audio_encoder.projection.")})
 
+ model.eval()
+ print("Model loaded successfully!")
  ```
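Once the weights are mapped, the embedder can be exercised end to end. A minimal usage sketch follows (the image path `cat.jpg` is a placeholder, and PIL is assumed from the `pillow` install step); since the encode methods above return L2-normalized 384-dim embeddings, cosine similarity reduces to a dot product:

```python
# Sketch only: cross-modal similarity with the embedder defined above
# ("cat.jpg" is a placeholder path; any RGB image works)
import torch
from PIL import Image

with torch.no_grad():
    text_emb = model.encode_text(["A photo of a cat"])                        # shape [1, 384]
    image_emb = model.encode_image(Image.open("cat.jpg").convert("RGB"))      # shape [1, 384]

# Embeddings are already L2-normalized, so the dot product is the cosine similarity
similarity = (text_emb @ image_emb.T).item()
print(f"text-image similarity: {similarity:.3f}")
```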
 
  ### Text Embedding
 
  ```python