# warbler_cda/stat7_rag_bridge.py
"""
STAT7-RAG Bridge: Realm-Agnostic Hybrid Scoring for Document Retrieval
Bridges RAG documents with STAT7 addressing coordinates for intelligent,
multi-dimensional hybrid scoring that combines semantic similarity with
STAT7 entanglement resonance.
Supports any realm type (game, system, faculty, pattern, data, business, concept, etc.)
and scales deterministically to 10K+ documents.
Author: The Seed Phase 1 Integration
Status: Production-ready validation bridge
"""
from dataclasses import dataclass, field
from typing import Any, Callable, Dict, List, Optional, Tuple
import math
import random
# ============================================================================
# Data Structures: Realm-Agnostic STAT7 Addressing
# ============================================================================
@dataclass
class Realm:
"""Flexible realm definition for any relationship domain."""
type: str # e.g. "game", "system", "faculty", "pattern", "data", "narrative", "business", "concept"
label: str # human-readable name
@dataclass
class STAT7Address:
"""
    STAT7 coordinate system: 7 core dimensions for unique, multidimensional
    addressing, plus an entropy signal used in resonance scoring.
    - realm: Domain/context (flexible type + label)
    - lineage: Version/generation (int >= 0)
    - adjacency: Graph connectivity score (0.0-1.0)
    - horizon: Zoom level / lifecycle stage (logline, outline, scene, panel, etc.)
    - luminosity: Clarity/coherence/activity (0.0-1.0)
    - polarity: Tension/contrast/resonance (0.0-1.0)
    - dimensionality: Complexity/thread count (1-8, bucketed)
    - entropy: Variability signal compared during alignment scoring (0.0-1.0)
    """
realm: Realm
lineage: int
adjacency: float
horizon: str
luminosity: float
polarity: float
    dimensionality: int
    entropy: float = 0.5  # neutral default; the deterministic generator below omits entropy
def __post_init__(self):
"""Validate STAT8 constraints."""
assert 0.0 <= self.adjacency <= 1.0, f"adjacency must be [0,1], got {self.adjacency}"
assert 0.0 <= self.luminosity <= 1.0, f"luminosity must be [0,1], got {self.luminosity}"
assert 0.0 <= self.polarity <= 1.0, f"polarity must be [0,1], got {self.polarity}"
assert 0.0 <= self.entropy <= 1.0, f"entropy must be [0,1], got {self.entropy}"
assert self.lineage >= 0, f"lineage must be >= 0, got {self.lineage}"
assert (
1 <= self.dimensionality <= 8
), f"dimensionality must be [1,8], got {self.dimensionality}"
def to_dict(self) -> Dict[str, Any]:
"""Export as dictionary for serialization."""
return {
"realm": {"type": self.realm.type, "label": self.realm.label},
"lineage": self.lineage,
"adjacency": self.adjacency,
"horizon": self.horizon,
"luminosity": self.luminosity,
"polarity": self.polarity,
"dimensionality": self.dimensionality,
"entropy": self.entropy,
}
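# Usage sketch (illustrative; the coordinate values and labels below are
# made-up examples, not canonical data):
#
#     addr = STAT7Address(
#         realm=Realm(type="concept", label="demo"),
#         lineage=2,
#         adjacency=0.75,
#         horizon="scene",
#         luminosity=0.6,
#         polarity=0.4,
#         dimensionality=3,
#         entropy=0.5,
#     )
#     addr.to_dict()["horizon"]  # -> "scene"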
@dataclass
class RAGDocument:
"""RAG document enhanced with STAT7 addressing."""
id: str
text: str
embedding: List[float]
stat7: STAT7Address
metadata: Dict[str, Any] = field(default_factory=dict)
def __post_init__(self):
"""Validate document structure."""
assert len(self.embedding) > 0, f"embedding must not be empty for {self.id}"
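# Usage sketch (illustrative; the toy embedding vector is an assumption, and
# `addr` refers to the STAT7Address sketched above):
#
#     doc = RAGDocument(
#         id="doc-000001",
#         text="Example passage",
#         embedding=[0.12, 0.88, 0.05],
#         stat7=addr,
#     )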
# ============================================================================
# Scoring Functions: Semantic + STAT7 Hybrid
# ============================================================================
def cosine_similarity(a: List[float], b: List[float]) -> float:
"""
Compute cosine similarity between two embedding vectors.
Range: [-1, 1], typically [0, 1] for normalized embeddings.
"""
if not a or not b:
return 0.0
dot = sum(x * y for x, y in zip(a, b))
norm_a = math.sqrt(sum(x * x for x in a))
norm_b = math.sqrt(sum(y * y for y in b))
denom = norm_a * norm_b + 1e-12 # Avoid division by zero
return dot / denom
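# Quick sanity check (illustrative toy vectors): parallel vectors score ~1.0,
# orthogonal vectors 0.0.
#
#     cosine_similarity([1.0, 0.0], [2.0, 0.0])  # ~1.0
#     cosine_similarity([1.0, 0.0], [0.0, 1.0])  # 0.0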
def stat7_resonance(query_stat7: STAT7Address, doc_stat7: STAT7Address) -> float:
"""
Compute STAT7 resonance between query and document addresses.
This is the "entanglement score" — how well-aligned are the 7 dimensions?
Scoring strategy:
- Realm match (type > label): 1.0 if type matches, 0.85 if not; +0.1 if label matches
- Horizon alignment: 1.0 if same, 0.9 if adjacent, 0.7 if different
    - Lineage proximity: decays 0.05 per generation of distance (floor 0.7)
    - Signal alignment: how close are luminosity/polarity/entropy? (0.0-1.0)
- Adjacency/Dimensionality: connectivity and complexity bonuses
Returns: [0.0, 1.0] resonance score
"""
# Realm match (type is primary, label is secondary boost)
realm_score = 1.0 if query_stat7.realm.type == doc_stat7.realm.type else 0.85
if query_stat7.realm.label == doc_stat7.realm.label:
realm_score += 0.1
realm_score = min(realm_score, 1.0) # Cap at 1.0
# Horizon alignment: scale by distance
horizon_levels = {"logline": 1, "outline": 2, "scene": 3, "panel": 4}
h_query = horizon_levels.get(query_stat7.horizon, 3)
h_doc = horizon_levels.get(doc_stat7.horizon, 3)
h_distance = abs(h_query - h_doc)
if h_distance == 0:
horizon_score = 1.0
elif h_distance == 1:
horizon_score = 0.9
else:
horizon_score = 0.7
# Lineage proximity: prefer ±0-1 generation distance
lineage_distance = abs(query_stat7.lineage - doc_stat7.lineage)
lineage_score = max(0.7, 1.0 - 0.05 * lineage_distance)
# Signal alignment: luminosity + polarity + entropy
luminosity_diff = abs(query_stat7.luminosity - doc_stat7.luminosity)
polarity_diff = abs(query_stat7.polarity - doc_stat7.polarity)
entropy_diff = abs(query_stat7.entropy - doc_stat7.entropy)
signal_score = 1.0 - (1 / 3) * (luminosity_diff + polarity_diff + entropy_diff)
signal_score = max(0.0, signal_score)
# Adjacency/Dimensionality bonus: connectivity + complexity
adj_bonus = doc_stat7.adjacency # Prefer well-connected docs
dim_bonus = min(1.0, doc_stat7.dimensionality / 7.0) # Normalize to [0,1]
adj_dim_score = 0.5 * adj_bonus + 0.5 * dim_bonus
# Combine all scores (multiplicative for strict alignment, additive bonus
# for complexity)
resonance = realm_score * horizon_score * lineage_score * signal_score
# 20% bonus from connectivity/complexity
resonance *= 0.8 + 0.2 * adj_dim_score
return max(0.0, min(resonance, 1.0)) # Clamp to [0,1]
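# Worked sketch (toy coordinates, assumed for illustration): a query and doc
# sharing realm type and label (realm_score caps at 1.0), one horizon level
# apart (0.9), one lineage generation apart (0.95), and differing by 0.1 in
# both luminosity and polarity with equal entropy (signal ~0.9333):
#
#     base = 1.0 * 0.9 * 0.95 * 0.9333             # ~0.798
#     adj_dim = 0.5 * 0.8 + 0.5 * min(1.0, 7 / 7.0)  # doc adjacency 0.8, dim 7 -> 0.9
#     base * (0.8 + 0.2 * adj_dim)                  # resonance ~0.782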
def hybrid_score(
query_embedding: List[float],
doc: RAGDocument,
query_stat7: STAT7Address,
weight_semantic: float = 0.6,
weight_stat7: float = 0.4,
) -> float:
"""
Hybrid scoring: combine semantic similarity with STAT7 resonance.
Args:
query_embedding: Query embedding vector
doc: RAG document with embedding and STAT7 address
query_stat7: Query STAT7 address
weight_semantic: Weight for semantic similarity (default 0.6)
weight_stat7: Weight for STAT7 resonance (default 0.4)
Returns: [0.0, 1.0] hybrid score
"""
    assert abs(weight_semantic + weight_stat7 - 1.0) < 1e-9, "Weights must sum to 1.0"
semantic_sim = cosine_similarity(query_embedding, doc.embedding)
stat7_res = stat7_resonance(query_stat7, doc.stat7)
hybrid = (weight_semantic * semantic_sim) + (weight_stat7 * stat7_res)
return max(0.0, min(hybrid, 1.0)) # Clamp to [0,1]
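# Worked arithmetic (assumed toy scores): semantic similarity 0.80 and STAT7
# resonance 0.50 combine under the default weights as
#
#     0.6 * 0.80 + 0.4 * 0.50  # = 0.68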
# ============================================================================
# Retrieval: Hybrid RAG Search
# ============================================================================
def retrieve(
documents: List[RAGDocument],
query_embedding: List[float],
query_stat7: STAT7Address,
k: int = 10,
weight_semantic: float = 0.6,
weight_stat7: float = 0.4,
) -> List[Tuple[str, float]]:
"""
Retrieve top-k documents using hybrid (semantic + STAT7) scoring.
Args:
documents: List of RAG documents to search
query_embedding: Query embedding vector
query_stat7: Query STAT7 address
k: Number of results to return
weight_semantic: Weight for semantic similarity
weight_stat7: Weight for STAT7 resonance
Returns: List of (doc_id, hybrid_score) tuples, sorted by score (descending)
"""
scores = []
for doc in documents:
score = hybrid_score(query_embedding, doc, query_stat7, weight_semantic, weight_stat7)
scores.append((doc.id, score))
# Sort by score descending, return top-k
return sorted(scores, key=lambda x: x[1], reverse=True)[:k]
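# Usage sketch (illustrative; `docs`, `query_vec`, and `query_addr` stand in
# for your own corpus, embedding model output, and query address):
#
#     for doc_id, score in retrieve(docs, query_vec, query_addr, k=5):
#         print(f"{doc_id}: {score:.3f}")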
def retrieve_semantic_only(
documents: List[RAGDocument],
query_embedding: List[float],
k: int = 10,
) -> List[Tuple[str, float]]:
"""
Retrieve top-k documents using semantic similarity only (baseline).
Args:
documents: List of RAG documents to search
query_embedding: Query embedding vector
k: Number of results to return
Returns: List of (doc_id, semantic_score) tuples, sorted by score (descending)
"""
scores = []
for doc in documents:
score = cosine_similarity(query_embedding, doc.embedding)
scores.append((doc.id, score))
return sorted(scores, key=lambda x: x[1], reverse=True)[:k]
# ============================================================================
# Utilities: Document Generation & STAT7 Randomization
# ============================================================================
def generate_random_stat7_address(
realm: Realm,
lineage_range: Tuple[int, int] = (0, 10),
horizon_choices: Optional[List[str]] = None,
seed_offset: int = 0,
) -> STAT7Address:
"""
Generate a random STAT7 address with optional seeding.
Args:
realm: Realm for this address
lineage_range: Min/max for lineage generation
horizon_choices: List of horizon options (default: common levels)
        seed_offset: Reserved for reproducibility; currently unused (draws come from the global random state)
Returns: Randomized STAT7Address
"""
if horizon_choices is None:
horizon_choices = ["logline", "outline", "scene", "panel"]
return STAT7Address(
realm=realm,
lineage=random.randint(lineage_range[0], lineage_range[1]),
adjacency=round(random.random(), 2),
horizon=random.choice(horizon_choices),
luminosity=round(random.random(), 2),
polarity=round(random.random(), 2),
dimensionality=random.randint(1, 8),
entropy=round(random.random(), 2),
)
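# Usage sketch (illustrative): this helper draws from the module-level RNG,
# so seed it first for reproducible addresses.
#
#     random.seed(42)
#     addr = generate_random_stat7_address(Realm(type="data", label="synthetic"))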
def generate_synthetic_rag_documents(
base_texts: List[str],
realm: Realm,
scale: int,
    embedding_fn: Callable[[str], List[float]],
randomize_stat7: bool = False,
seed: Optional[int] = None,
) -> List[RAGDocument]:
"""
Generate synthetic RAG documents with STAT7 addresses.
Args:
base_texts: List of base text templates (will be varied)
realm: Realm for all generated documents
scale: Number of documents to generate
embedding_fn: Function to embed text (e.g., embedding_provider.embed_text)
randomize_stat7: If True, randomize all 7 STAT7 dimensions per doc
seed: Random seed for reproducibility
Returns: List of RAGDocument with embeddings and STAT7 addresses
"""
if seed is not None:
random.seed(seed)
documents = []
for i in range(scale):
# Vary text template
base_idx = i % len(base_texts)
base_text = base_texts[base_idx]
text = f"[Context {i}] {base_text} (instance {i})"
# Embed text
embedding = embedding_fn(text)
# Assign STAT7 address
if randomize_stat7:
stat7 = generate_random_stat7_address(realm, seed_offset=i)
else:
            # Deterministic: map index to STAT7 dimensions
            # (entropy is left at its field default)
stat7 = STAT7Address(
realm=realm,
lineage=i % 10,
adjacency=round((i % 100) / 100.0, 2),
horizon=["logline", "outline", "scene", "panel"][i % 4],
luminosity=round((i % 10) / 10.0, 2),
polarity=round(((i + 5) % 10) / 10.0, 2),
dimensionality=1 + (i % 7),
)
doc = RAGDocument(
id=f"doc-{i:06d}",
text=text,
embedding=embedding,
stat7=stat7,
metadata={
"source": f"pack-{base_idx % 3}",
"category": ["core", "wisdom", "politics"][base_idx % 3],
"generated_index": i,
},
)
documents.append(doc)
return documents
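# Usage sketch with a stand-in embedder (`toy_embed` is a made-up hash-based
# function, assumed here only to keep the example self-contained; substitute
# your real embedding provider):
#
#     def toy_embed(text: str) -> List[float]:
#         h = hash(text)
#         return [((h >> (8 * i)) & 0xFF) / 255.0 for i in range(8)]
#
#     docs = generate_synthetic_rag_documents(
#         base_texts=["alpha", "beta", "gamma"],
#         realm=Realm(type="data", label="synthetic"),
#         scale=100,
#         embedding_fn=toy_embed,
#         randomize_stat7=True,
#         seed=7,
#     )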
# ============================================================================
# Analysis: Comparison & Diagnostics
# ============================================================================
def compare_retrieval_results(
semantic_results: List[Tuple[str, float]],
hybrid_results: List[Tuple[str, float]],
k: int = 10,
) -> Dict[str, Any]:
"""
Compare semantic-only vs hybrid retrieval results.
Returns metrics:
- overlap: How many of top-k are shared?
- semantic_avg_score: Average semantic score in top-k
- hybrid_avg_score: Average hybrid score in top-k
- reranking_distance: How much did hybrid rerank results?
"""
semantic_ids = {doc_id for doc_id, _ in semantic_results[:k]}
hybrid_ids = {doc_id for doc_id, _ in hybrid_results[:k]}
overlap = len(semantic_ids & hybrid_ids)
overlap_pct = (overlap / k * 100) if k > 0 else 0.0
    n_semantic = min(k, len(semantic_results))
    n_hybrid = min(k, len(hybrid_results))
    semantic_avg = (
        sum(score for _, score in semantic_results[:k]) / n_semantic if n_semantic > 0 else 0.0
    )
    hybrid_avg = (
        sum(score for _, score in hybrid_results[:k]) / n_hybrid if n_hybrid > 0 else 0.0
    )
# Measure ranking distance: how far did top-k items move?
semantic_rank = {doc_id: idx for idx, (doc_id, _) in enumerate(semantic_results[:k])}
reranking_distances = []
for idx, (doc_id, _) in enumerate(hybrid_results[:k]):
if doc_id in semantic_rank:
distance = abs(idx - semantic_rank[doc_id])
reranking_distances.append(distance)
avg_reranking_distance = (
sum(reranking_distances) / len(reranking_distances) if reranking_distances else 0.0
)
return {
"overlap_count": overlap,
"overlap_pct": overlap_pct,
"semantic_avg_score": round(semantic_avg, 4),
"hybrid_avg_score": round(hybrid_avg, 4),
"score_improvement": round(hybrid_avg - semantic_avg, 4),
"avg_reranking_distance": round(avg_reranking_distance, 2),
}
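# Usage sketch (illustrative; reuses the hypothetical `docs`, `query_vec`,
# and `query_addr` from the sketches above):
#
#     baseline = retrieve_semantic_only(docs, query_vec, k=10)
#     hybrid = retrieve(docs, query_vec, query_addr, k=10)
#     metrics = compare_retrieval_results(baseline, hybrid, k=10)
#     print(metrics["overlap_pct"], metrics["avg_reranking_distance"])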
# ============================================================================
# STAT7RAGBridge: Wrapper for RetrievalAPI Integration
# ============================================================================
class STAT7RAGBridge:
"""
Bridge class that provides STAT7 functionality for RetrievalAPI integration.
Wraps the module-level STAT7 functions (stat7_resonance, hybrid_score, retrieve)
to provide a consistent interface for the RetrievalAPI's hybrid scoring system.
This allows RetrievalAPI to work with STAT7 coordinates seamlessly through
dependency injection.
"""
def stat7_resonance(self, query_stat7: STAT7Address, doc_stat7: STAT7Address) -> float:
"""
Compute STAT7 resonance between query and document addresses.
Args:
query_stat7: Query STAT7 address
doc_stat7: Document STAT7 address
Returns: [0.0, 1.0] resonance score
"""
return stat7_resonance(query_stat7, doc_stat7)
def hybrid_score(
self,
query_embedding: List[float],
doc: RAGDocument,
query_stat7: STAT7Address,
weight_semantic: float = 0.6,
weight_stat7: float = 0.4,
) -> float:
"""
Compute hybrid score combining semantic similarity with STAT7 resonance.
Args:
query_embedding: Query embedding vector
doc: RAG document with embedding and STAT7 address
query_stat7: Query STAT7 address
weight_semantic: Weight for semantic similarity (default 0.6)
weight_stat7: Weight for STAT7 resonance (default 0.4)
Returns: [0.0, 1.0] hybrid score
"""
return hybrid_score(query_embedding, doc, query_stat7, weight_semantic, weight_stat7)
def retrieve(
self,
documents: List[RAGDocument],
query_embedding: List[float],
query_stat7: STAT7Address,
k: int = 10,
weight_semantic: float = 0.6,
weight_stat7: float = 0.4,
) -> List[Tuple[str, float]]:
"""
Retrieve top-k documents using hybrid (semantic + STAT7) scoring.
Args:
documents: List of RAG documents to search
query_embedding: Query embedding vector
query_stat7: Query STAT7 address
k: Number of results to return
weight_semantic: Weight for semantic similarity
weight_stat7: Weight for STAT7 resonance
Returns: List of (doc_id, hybrid_score) tuples, sorted by score (descending)
"""
return retrieve(documents, query_embedding, query_stat7, k, weight_semantic, weight_stat7)
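# Usage sketch (illustrative): the bridge is stateless, so one shared instance
# can be injected wherever a STAT7 scorer is expected; the RetrievalAPI
# injection site itself lives outside this module.
#
#     bridge = STAT7RAGBridge()
#     results = bridge.retrieve(docs, query_vec, query_addr, k=10)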