Spaces:
Running
on
Zero
Running
on
Zero
Bellok
committed on
Commit
·
82f450e
1
Parent(s):
382376e
feat(app/docker): integrate committed packs and optimize embedding provider
Browse files Configure embedding provider with 4 workers, batch size 64, and caching for better throughput. Prioritize loading local committed packs (faction-politics, wisdom-scrolls) over HuggingFace downloads, suppress progress bars, and copy packs into Dockerfile. Update READMEs to fix dataset name references. Reduces startup time and ensures essential packs are always available locally.
- Dockerfile +5 -0
- app.py +25 -5
- packs/warbler-pack-core/README_HF_DATASET.md +1 -1
- packs/warbler-pack-faction-politics/README_HF_DATASET.md +1 -1
- packs/{warbler-pack-hf-npc-dialogue → warbler-pack-npc-dialogue}/package.json +3 -3
- packs/warbler-pack-npc-dialogue/warbler-pack-npc-dialogue.jsonl +30 -0
- packs/warbler-pack-wisdom-scrolls/README_HF_DATASET.md +1 -1
- warbler_cda/embeddings/sentence_transformer_provider.py +87 -4
Dockerfile
CHANGED
|
@@ -21,6 +21,11 @@ RUN pip install --no-cache-dir -r requirements.txt \
|
|
| 21 |
# Copy the warbler_cda module
|
| 22 |
COPY warbler_cda/ ./warbler_cda/
|
| 23 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
# Copy server startup script
|
| 25 |
COPY start_server.py ./
|
| 26 |
|
|
|
|
| 21 |
# Copy the warbler_cda module
|
| 22 |
COPY warbler_cda/ ./warbler_cda/
|
| 23 |
|
| 24 |
+
# Copy committed packs that stay in repo (per .gitignore)
|
| 25 |
+
COPY packs/warbler-pack-core/ ./packs/warbler-pack-core/
|
| 26 |
+
COPY packs/warbler-pack-faction-politics/ ./packs/warbler-pack-faction-politics/
|
| 27 |
+
COPY packs/warbler-pack-wisdom-scrolls/ ./packs/warbler-pack-wisdom-scrolls/
|
| 28 |
+
|
| 29 |
# Copy server startup script
|
| 30 |
COPY start_server.py ./
|
| 31 |
|
app.py
CHANGED
|
@@ -26,8 +26,12 @@ from warbler_cda.pack_loader import PackLoader
|
|
| 26 |
print("🚀 Initializing Warbler CDA...")
|
| 27 |
|
| 28 |
# Initialize the system components
|
| 29 |
-
print("⚙️ Creating embedding provider...")
|
| 30 |
-
embedding_provider = EmbeddingProviderFactory.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
print(f"✅ Embedding provider: {embedding_provider.get_provider_info()['provider_id']}")
|
| 32 |
|
| 33 |
print("⚙️ Initializing semantic anchors...")
|
|
@@ -56,15 +60,31 @@ documents = pack_loader.discover_documents()
|
|
| 56 |
if len(documents) == 0:
|
| 57 |
print("⚠️ No packs found locally. Attempting to download from HuggingFace...")
|
| 58 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 59 |
from warbler_cda.utils.hf_warbler_ingest import HFWarblerIngestor
|
| 60 |
ingestor = HFWarblerIngestor(packs_dir=pack_loader.packs_dir, verbose=True)
|
| 61 |
-
#
|
|
|
|
|
|
|
|
|
|
| 62 |
datasets_to_download = [
|
| 63 |
"arxiv-1", "arxiv-2", "arxiv-3", "arxiv-4", "arxiv-5", # First 5 arxiv chunks
|
| 64 |
-
"novels", "manuals", "enterprise", "edustories", "npc-dialogue",
|
| 65 |
-
"
|
| 66 |
]
|
| 67 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 68 |
total_docs = 0
|
| 69 |
successful_downloads = 0
|
| 70 |
|
|
|
|
| 26 |
print("🚀 Initializing Warbler CDA...")
|
| 27 |
|
| 28 |
# Initialize the system components
|
| 29 |
+
print("⚙️ Creating embedding provider with 4 workers...")
|
| 30 |
+
embedding_provider = EmbeddingProviderFactory.create_provider("sentence_transformer", {
|
| 31 |
+
"num_workers": 4,
|
| 32 |
+
"batch_size": 64, # Larger batches for better throughput
|
| 33 |
+
"cache_dir": ".embedding_cache"
|
| 34 |
+
})
|
| 35 |
print(f"✅ Embedding provider: {embedding_provider.get_provider_info()['provider_id']}")
|
| 36 |
|
| 37 |
print("⚙️ Initializing semantic anchors...")
|
|
|
|
| 60 |
if len(documents) == 0:
|
| 61 |
print("⚠️ No packs found locally. Attempting to download from HuggingFace...")
|
| 62 |
try:
|
| 63 |
+
# Suppress HF datasets progress bars for cleaner output
|
| 64 |
+
import os
|
| 65 |
+
os.environ["HF_HUB_DISABLE_PROGRESS_BARS"] = "1"
|
| 66 |
+
|
| 67 |
from warbler_cda.utils.hf_warbler_ingest import HFWarblerIngestor
|
| 68 |
ingestor = HFWarblerIngestor(packs_dir=pack_loader.packs_dir, verbose=True)
|
| 69 |
+
# First, try to load packs that are committed to the repo (not HF datasets)
|
| 70 |
+
local_only_packs = ["warbler-pack-faction-politics", "warbler-pack-wisdom-scrolls"]
|
| 71 |
+
|
| 72 |
+
# Then download HF datasets
|
| 73 |
datasets_to_download = [
|
| 74 |
"arxiv-1", "arxiv-2", "arxiv-3", "arxiv-4", "arxiv-5", # First 5 arxiv chunks
|
| 75 |
+
"novels", "manuals", "enterprise", "edustories", "npc-dialogue",
|
| 76 |
+
"portuguese-edu", "prompt-report"
|
| 77 |
]
|
| 78 |
|
| 79 |
+
# Check if local packs exist and should be loaded
|
| 80 |
+
local_pack_count = 0
|
| 81 |
+
for pack_name in local_only_packs:
|
| 82 |
+
pack_path = pack_loader.packs_dir / pack_name
|
| 83 |
+
if pack_path.exists():
|
| 84 |
+
print(f"📚 Loading committed pack: {pack_name}")
|
| 85 |
+
# These are already in the repo, so they should be discoverable
|
| 86 |
+
local_pack_count += 1
|
| 87 |
+
|
| 88 |
total_docs = 0
|
| 89 |
successful_downloads = 0
|
| 90 |
|
packs/warbler-pack-core/README_HF_DATASET.md
CHANGED
|
@@ -70,7 +70,7 @@ Part of **Warbler CDA** (Cognitive Development Architecture) - a production-read
|
|
| 70 |
|
| 71 |
- [warbler-pack-faction-politics](https://huggingface.co/datasets/tiny-walnut-games/warbler-pack-faction-politics) - Political intrigue templates
|
| 72 |
- [warbler-pack-wisdom-scrolls](https://huggingface.co/datasets/tiny-walnut-games/warbler-pack-wisdom-scrolls) - Wisdom generation templates
|
| 73 |
-
- [warbler-pack-
|
| 74 |
|
| 75 |
## License
|
| 76 |
|
|
|
|
| 70 |
|
| 71 |
- [warbler-pack-faction-politics](https://huggingface.co/datasets/tiny-walnut-games/warbler-pack-faction-politics) - Political intrigue templates
|
| 72 |
- [warbler-pack-wisdom-scrolls](https://huggingface.co/datasets/tiny-walnut-games/warbler-pack-wisdom-scrolls) - Wisdom generation templates
|
| 73 |
+
- [warbler-pack-npc-dialogue](https://huggingface.co/datasets/tiny-walnut-games/warbler-pack-npc-dialogue) - NPC dialogue from HuggingFace sources
|
| 74 |
|
| 75 |
## License
|
| 76 |
|
packs/warbler-pack-faction-politics/README_HF_DATASET.md
CHANGED
|
@@ -81,7 +81,7 @@ Part of **Warbler CDA** (Cognitive Development Architecture) - a production-read
|
|
| 81 |
|
| 82 |
- [warbler-pack-core](https://huggingface.co/datasets/tiny-walnut-games/warbler-pack-core) - Core conversation templates
|
| 83 |
- [warbler-pack-wisdom-scrolls](https://huggingface.co/datasets/tiny-walnut-games/warbler-pack-wisdom-scrolls) - Wisdom generation templates
|
| 84 |
-
- [warbler-pack-
|
| 85 |
|
| 86 |
## License
|
| 87 |
|
|
|
|
| 81 |
|
| 82 |
- [warbler-pack-core](https://huggingface.co/datasets/tiny-walnut-games/warbler-pack-core) - Core conversation templates
|
| 83 |
- [warbler-pack-wisdom-scrolls](https://huggingface.co/datasets/tiny-walnut-games/warbler-pack-wisdom-scrolls) - Wisdom generation templates
|
| 84 |
+
- [warbler-pack-npc-dialogue](https://huggingface.co/datasets/tiny-walnut-games/warbler-pack-npc-dialogue) - NPC dialogue from HuggingFace sources
|
| 85 |
|
| 86 |
## License
|
| 87 |
|
packs/{warbler-pack-hf-npc-dialogue → warbler-pack-npc-dialogue}/package.json
RENAMED
|
@@ -1,10 +1,10 @@
|
|
| 1 |
{
|
| 2 |
-
"name": "warbler-pack-
|
| 3 |
"version": "1.0.0",
|
| 4 |
-
"description": "Warbler pack generated
|
| 5 |
"created_at": "2025-11-19T21:24:41.170415",
|
| 6 |
"document_count": 1000,
|
| 7 |
-
"source": "
|
| 8 |
"content_types": [
|
| 9 |
"character_interaction"
|
| 10 |
],
|
|
|
|
| 1 |
{
|
| 2 |
+
"name": "warbler-pack-npc-dialogue",
|
| 3 |
"version": "1.0.0",
|
| 4 |
+
"description": "Warbler pack generated by Copilot. Contains NPC dialogue data for in-game characters.",
|
| 5 |
"created_at": "2025-11-19T21:24:41.170415",
|
| 6 |
"document_count": 1000,
|
| 7 |
+
"source": "Warbler CDA Ingestor",
|
| 8 |
"content_types": [
|
| 9 |
"character_interaction"
|
| 10 |
],
|
packs/warbler-pack-npc-dialogue/warbler-pack-npc-dialogue.jsonl
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"content":"The weather has been quite unusual lately in {{location}}. Some say it’s a sign of change.","content_id":"warbler-pack-core/flavor_weather","metadata":{"dialogue_type":"template","title":"Weather Flavor","description":"NPC comments on local weather conditions","pack":"warbler-pack-core","type":"template"}}
|
| 2 |
+
{"content":"Have you heard the latest rumors? They say something strange is happening near the {{landmark}}.","content_id":"warbler-pack-core/rumor_hook","metadata":{"dialogue_type":"template","title":"Rumor Hook","description":"NPC shares a rumor to spark curiosity","pack":"warbler-pack-core","type":"template"}}
|
| 3 |
+
{"content":"I’ve lived in {{location}} my whole life. It has its charms, but also its secrets.","content_id":"warbler-pack-core/local_lore","metadata":{"dialogue_type":"template","title":"Local Lore","description":"NPC shares a bit of local history or mystery","pack":"warbler-pack-core","type":"template"}}
|
| 4 |
+
{"content":"You look like you’ve traveled far, {{user_name}}. Care to share a tale from your journey?","content_id":"warbler-pack-core/traveler_prompt","metadata":{"dialogue_type":"template","title":"Traveler Prompt","description":"NPC invites the player to share their story","pack":"warbler-pack-core","type":"template"}}
|
| 5 |
+
{"content":"Be careful out there. The roads beyond {{location}} aren’t as safe as they used to be.","content_id":"warbler-pack-core/warning_general","metadata":{"dialogue_type":"template","title":"General Warning","description":"NPC warns the player about dangers ahead","pack":"warbler-pack-core","type":"template"}}
|
| 6 |
+
{"content":"Sometimes I wonder what lies beyond the {{location_type}}. Perhaps adventure, perhaps danger.","content_id":"warbler-pack-core/philosophical_idle","metadata":{"dialogue_type":"template","title":"Philosophical Idle","description":"NPC muses idly about the world","pack":"warbler-pack-core","type":"template"}}
|
| 7 |
+
{"content":"If you’re looking for work, I heard {{npc_name}} is seeking help with a task.","content_id":"warbler-pack-core/quest_hint","metadata":{"dialogue_type":"template","title":"Quest Hint","description":"NPC hints at a possible quest giver","pack":"warbler-pack-core","type":"template"}}
|
| 8 |
+
{"content":"Oh, pardon me! I didn’t mean to bump into you. These streets get crowded at {{time_of_day}}.","content_id":"warbler-pack-core/incidental_encounter","metadata":{"dialogue_type":"template","title":"Incidental Encounter","description":"NPC apologizes or reacts to a casual collision","pack":"warbler-pack-core","type":"template"}}
|
| 9 |
+
{"content":"I’m just passing the time. Not much happens around here, but it’s peaceful.","content_id":"warbler-pack-core/idle_chatter","metadata":{"dialogue_type":"template","title":"Idle Chatter","description":"NPC filler dialogue for downtime moments","pack":"warbler-pack-core","type":"template"}}
|
| 10 |
+
{"content":"Strange noises have been coming from the {{location_type}} at night. Gives me chills just thinking about it.","content_id":"warbler-pack-core/spooky_flavor","metadata":{"dialogue_type":"template","title":"Spooky Flavor","description":"NPC shares eerie environmental detail","pack":"warbler-pack-core","type":"template"}}
|
| 11 |
+
{"content":"The market in {{location}} has been bustling lately. Merchants from far and wide bring their wares.","content_id":"warbler-pack-core/flavor_market","metadata":{"dialogue_type":"template","title":"Market Flavor","description":"NPC comments on the busy marketplace","pack":"warbler-pack-core","type":"template"}}
|
| 12 |
+
{"content":"I can’t shake the feeling that someone’s been watching the {{location_type}} at night.","content_id":"warbler-pack-core/suspicion_idle","metadata":{"dialogue_type":"template","title":"Suspicious Idle","description":"NPC expresses unease about strange happenings","pack":"warbler-pack-core","type":"template"}}
|
| 13 |
+
{"content":"You remind me of someone I once knew, {{user_name}}. Strong spirit, determined eyes.","content_id":"warbler-pack-core/personal_connection","metadata":{"dialogue_type":"template","title":"Personal Connection","description":"NPC draws a personal parallel with the player","pack":"warbler-pack-core","type":"template"}}
|
| 14 |
+
{"content":"If you’re seeking adventure, the {{landmark}} is said to hold treasures and dangers alike.","content_id":"warbler-pack-core/adventure_hook","metadata":{"dialogue_type":"template","title":"Adventure Hook","description":"NPC hints at exploration opportunities","pack":"warbler-pack-core","type":"template"}}
|
| 15 |
+
{"content":"The children of {{location}} love to play near the {{landmark}}. It’s a joyful sight.","content_id":"warbler-pack-core/flavor_children","metadata":{"dialogue_type":"template","title":"Children Flavor","description":"NPC shares a wholesome local detail","pack":"warbler-pack-core","type":"template"}}
|
| 16 |
+
{"content":"I’ve seen many travelers pass through here, but few with your confidence, {{user_title}}.","content_id":"warbler-pack-core/traveler_observation","metadata":{"dialogue_type":"template","title":"Traveler Observation","description":"NPC remarks on the player’s presence","pack":"warbler-pack-core","type":"template"}}
|
| 17 |
+
{"content":"The {{npc_role}} has been looking for capable hands. Perhaps you should pay them a visit.","content_id":"warbler-pack-core/quest_pointer","metadata":{"dialogue_type":"template","title":"Quest Pointer","description":"NPC directs player toward a quest giver","pack":"warbler-pack-core","type":"template"}}
|
| 18 |
+
{"content":"Ah, the smell of fresh bread from the bakery in {{location}} always lifts my spirits.","content_id":"warbler-pack-core/flavor_food","metadata":{"dialogue_type":"template","title":"Food Flavor","description":"NPC comments on local food or drink","pack":"warbler-pack-core","type":"template"}}
|
| 19 |
+
{"content":"Stay vigilant, {{user_name}}. Trouble often hides where you least expect it.","content_id":"warbler-pack-core/general_encouragement","metadata":{"dialogue_type":"template","title":"General Encouragement","description":"NPC offers encouragement with a warning","pack":"warbler-pack-core","type":"template"}}
|
| 20 |
+
{"content":"I’ve heard songs sung about heroes like you. Perhaps one day, they’ll sing of your deeds.","content_id":"warbler-pack-core/heroic_flavor","metadata":{"dialogue_type":"template","title":"Heroic Flavor","description":"NPC elevates the player with mythic resonance","pack":"warbler-pack-core","type":"template"}}
|
| 21 |
+
{"content":"Care to test your skills? I’ve sparred with many, and I’d welcome the challenge.","content_id":"warbler-pack-core/combat_challenge","metadata":{"dialogue_type":"template","title":"Combat Challenge","description":"NPC invites the player to spar or fight","pack":"warbler-pack-core","type":"template"}}
|
| 22 |
+
{"content":"These roads are long and lonely. A bit of conversation makes the journey lighter, don’t you think?","content_id":"warbler-pack-core/travel_flavor","metadata":{"dialogue_type":"template","title":"Travel Flavor","description":"NPC shares a thought about travel","pack":"warbler-pack-core","type":"template"}}
|
| 23 |
+
{"content":"The {{location_type}} has been quiet lately. Too quiet, if you ask me.","content_id":"warbler-pack-core/flavor_quiet","metadata":{"dialogue_type":"template","title":"Quiet Flavor","description":"NPC comments on eerie silence","pack":"warbler-pack-core","type":"template"}}
|
| 24 |
+
{"content":"I once dreamed of leaving {{location}} behind, but life had other plans.","content_id":"warbler-pack-core/personal_idle","metadata":{"dialogue_type":"template","title":"Personal Idle","description":"NPC shares a personal reflection","pack":"warbler-pack-core","type":"template"}}
|
| 25 |
+
{"content":"If you’re trading, I’ve got rare goods from {{region}}. Interested?","content_id":"warbler-pack-core/trade_offer","metadata":{"dialogue_type":"template","title":"Trade Offer","description":"NPC offers specific trade goods","pack":"warbler-pack-core","type":"template"}}
|
| 26 |
+
{"content":"Legends say the {{landmark}} was built by hands not of this world.","content_id":"warbler-pack-core/legend_flavor","metadata":{"dialogue_type":"template","title":"Legend Flavor","description":"NPC shares a myth or legend","pack":"warbler-pack-core","type":"template"}}
|
| 27 |
+
{"content":"I’m just enjoying the sunshine. Days like this remind me why I love {{location}}.","content_id":"warbler-pack-core/flavor_sunshine","metadata":{"dialogue_type":"template","title":"Sunshine Flavor","description":"NPC comments on pleasant weather","pack":"warbler-pack-core","type":"template"}}
|
| 28 |
+
{"content":"You seem burdened, {{user_name}}. Is there something weighing on your mind?","content_id":"warbler-pack-core/empathy_prompt","metadata":{"dialogue_type":"template","title":"Empathy Prompt","description":"NPC expresses concern for the player","pack":"warbler-pack-core","type":"template"}}
|
| 29 |
+
{"content":"The bells of {{location}} ring at dawn and dusk. It’s a tradition we hold dear.","content_id":"warbler-pack-core/flavor_tradition","metadata":{"dialogue_type":"template","title":"Tradition Flavor","description":"NPC shares a cultural detail","pack":"warbler-pack-core","type":"template"}}
|
| 30 |
+
{"content":"I’m not much for adventure, but I admire those who seek it.","content_id":"warbler-pack-core/admiration_idle","metadata":{"dialogue_type":"template","title":"Admiration Idle","description":"NPC expresses admiration for adventurers","pack":"warbler-pack-core","type":"template"}}
|
packs/warbler-pack-wisdom-scrolls/README_HF_DATASET.md
CHANGED
|
@@ -110,7 +110,7 @@ Part of **Warbler CDA** (Cognitive Development Architecture) and the **Living De
|
|
| 110 |
|
| 111 |
- [warbler-pack-core](https://huggingface.co/datasets/tiny-walnut-games/warbler-pack-core) - Core conversation templates
|
| 112 |
- [warbler-pack-faction-politics](https://huggingface.co/datasets/tiny-walnut-games/warbler-pack-faction-politics) - Political dialogue templates
|
| 113 |
-
- [warbler-pack-
|
| 114 |
|
| 115 |
## License
|
| 116 |
|
|
|
|
| 110 |
|
| 111 |
- [warbler-pack-core](https://huggingface.co/datasets/tiny-walnut-games/warbler-pack-core) - Core conversation templates
|
| 112 |
- [warbler-pack-faction-politics](https://huggingface.co/datasets/tiny-walnut-games/warbler-pack-faction-politics) - Political dialogue templates
|
| 113 |
+
- [warbler-pack-npc-dialogue](https://huggingface.co/datasets/tiny-walnut-games/warbler-pack-npc-dialogue) - NPC dialogue from HuggingFace sources
|
| 114 |
|
| 115 |
## License
|
| 116 |
|
warbler_cda/embeddings/sentence_transformer_provider.py
CHANGED
|
@@ -33,6 +33,8 @@ class SentenceTransformerEmbeddingProvider(EmbeddingProvider):
|
|
| 33 |
else model_name_default
|
| 34 |
)
|
| 35 |
self.batch_size: int = config.get("batch_size", 32) if config else 32
|
|
|
|
|
|
|
| 36 |
cache_dir_default = ".embedding_cache"
|
| 37 |
self.cache_dir: str = (
|
| 38 |
config.get("cache_dir", cache_dir_default) if config else cache_dir_default
|
|
@@ -94,7 +96,7 @@ class SentenceTransformerEmbeddingProvider(EmbeddingProvider):
|
|
| 94 |
def embed_batch(
|
| 95 |
self, texts: List[str], show_progress: bool = False
|
| 96 |
) -> List[List[float]]:
|
| 97 |
-
"""Generate embeddings for multiple texts with batching and
|
| 98 |
# Check model initialization first, before processing
|
| 99 |
if texts and self.model is None:
|
| 100 |
raise RuntimeError("Model not initialized. Call _initialize_model first.")
|
|
@@ -125,11 +127,12 @@ class SentenceTransformerEmbeddingProvider(EmbeddingProvider):
|
|
| 125 |
raise ValueError("Model is not an instance of SentenceTransformer")
|
| 126 |
elif SentenceTransformer is None:
|
| 127 |
raise RuntimeError("SentenceTransformer not available but model is set")
|
| 128 |
-
|
|
|
|
|
|
|
| 129 |
texts_to_embed,
|
| 130 |
batch_size=self.batch_size,
|
| 131 |
-
|
| 132 |
-
show_progress_bar=show_progress,
|
| 133 |
)
|
| 134 |
|
| 135 |
for idx, batch_idx in enumerate(indices_to_embed):
|
|
@@ -150,6 +153,86 @@ class SentenceTransformerEmbeddingProvider(EmbeddingProvider):
|
|
| 150 |
|
| 151 |
return result
|
| 152 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 153 |
def semantic_search(
|
| 154 |
self, query_text: str, embeddings: List[List[float]], top_k: int = 5
|
| 155 |
) -> List[Tuple[int, float]]:
|
|
|
|
| 33 |
else model_name_default
|
| 34 |
)
|
| 35 |
self.batch_size: int = config.get("batch_size", 32) if config else 32
|
| 36 |
+
# Add worker configuration for parallelism
|
| 37 |
+
self.num_workers: int = config.get("num_workers", 4) if config else 4
|
| 38 |
cache_dir_default = ".embedding_cache"
|
| 39 |
self.cache_dir: str = (
|
| 40 |
config.get("cache_dir", cache_dir_default) if config else cache_dir_default
|
|
|
|
| 96 |
def embed_batch(
|
| 97 |
self, texts: List[str], show_progress: bool = False
|
| 98 |
) -> List[List[float]]:
|
| 99 |
+
"""Generate embeddings for multiple texts with batching, caching, and multi-worker parallelism."""
|
| 100 |
# Check model initialization first, before processing
|
| 101 |
if texts and self.model is None:
|
| 102 |
raise RuntimeError("Model not initialized. Call _initialize_model first.")
|
|
|
|
| 127 |
raise ValueError("Model is not an instance of SentenceTransformer")
|
| 128 |
elif SentenceTransformer is None:
|
| 129 |
raise RuntimeError("SentenceTransformer not available but model is set")
|
| 130 |
+
|
| 131 |
+
# Use multi-worker processing for large batches
|
| 132 |
+
batch_embeddings: Any = self._encode_with_workers(
|
| 133 |
texts_to_embed,
|
| 134 |
batch_size=self.batch_size,
|
| 135 |
+
show_progress=show_progress
|
|
|
|
| 136 |
)
|
| 137 |
|
| 138 |
for idx, batch_idx in enumerate(indices_to_embed):
|
|
|
|
| 153 |
|
| 154 |
return result
|
| 155 |
|
| 156 |
+
def _encode_with_workers(self, texts: List[str], batch_size: int = 32, show_progress: bool = False) -> Any:
    """Encode texts, fanning out over worker threads for large inputs.

    Inputs with fewer than 100 texts, or a provider configured with
    ``num_workers <= 1``, are encoded with a single ``self.model.encode``
    call and that call's return value is passed through unchanged.

    Larger inputs are split into at most ``self.num_workers`` contiguous
    chunks, each chunk is encoded on its own thread, and the per-chunk
    results are concatenated in the original text order.

    Args:
        texts: Texts to encode.
        batch_size: Batch size forwarded to ``self.model.encode``.
        show_progress: On the single-call path this is forwarded as
            ``show_progress_bar``; on the threaded path one line is printed
            per finished chunk instead.

    Returns:
        Whatever ``self.model.encode`` returns on the single-call path;
        a flat ``list`` with one embedding per input text on the threaded
        path.

    Raises:
        Exception: Any error raised by ``self.model.encode`` in a worker is
            re-raised here after all workers have finished.
    """
    total_texts = len(texts)

    # Small inputs do not amortize thread overhead. ``<= 1`` (rather than
    # ``== 1``) also keeps 0 or negative worker counts away from the
    # division below.
    if total_texts < 100 or self.num_workers <= 1:
        return self.model.encode(
            texts,
            batch_size=batch_size,
            convert_to_tensor=False,
            show_progress_bar=show_progress,
        )

    from concurrent.futures import ThreadPoolExecutor, as_completed

    # Ceiling division so the split never yields more chunks than workers
    # (floor division could leave a small extra trailing chunk).
    chunk_size = -(-total_texts // self.num_workers)
    text_chunks = [texts[i:i + chunk_size] for i in range(0, total_texts, chunk_size)]
    chunk_count = len(text_chunks)

    def worker_encode(chunk_texts: List[str]) -> Any:
        """Encode one chunk; per-chunk progress bars are disabled."""
        return self.model.encode(
            chunk_texts,
            batch_size=batch_size,
            convert_to_tensor=False,
            show_progress_bar=False,
        )

    # Results are stored by chunk index so completion order does not matter.
    embeddings_results: List[Any] = [None] * chunk_count

    # The ``with`` block joins every worker before leaving, including when
    # ``future.result()`` re-raises a worker's exception.
    with ThreadPoolExecutor(max_workers=chunk_count) as executor:
        future_to_index = {
            executor.submit(worker_encode, chunk): idx
            for idx, chunk in enumerate(text_chunks)
        }
        completed_workers = 0
        for future in as_completed(future_to_index):
            chunk_idx = future_to_index[future]
            embeddings_results[chunk_idx] = future.result()
            completed_workers += 1

            if show_progress:
                print(f"Worker {chunk_idx + 1}/{chunk_count} completed ({completed_workers}/{chunk_count})")

    # Combine results in original order
    final_embeddings: List[Any] = []
    for chunk_embeddings in embeddings_results:
        final_embeddings.extend(chunk_embeddings)

    return final_embeddings
|
| 235 |
+
|
| 236 |
def semantic_search(
|
| 237 |
self, query_text: str, embeddings: List[List[float]], top_k: int = 5
|
| 238 |
) -> List[Tuple[int, float]]:
|