Spaces:
Running
on
Zero
Running
on
Zero
| """ | |
| STAT7-RAG Bridge: Realm-Agnostic Hybrid Scoring for Document Retrieval | |
| Bridges RAG documents with STAT7 addressing coordinates for intelligent, | |
| multi-dimensional hybrid scoring that combines semantic similarity with | |
| STAT7 entanglement resonance. | |
| Supports any realm type (game, system, faculty, pattern, data, business, concept, etc.) | |
| and scales deterministically to 10K+ documents. | |
| Author: The Seed Phase 1 Integration | |
| Status: Production-ready validation bridge | |
| """ | |
| from dataclasses import dataclass, field | |
| from typing import Dict, Any, List, Tuple, Optional | |
| import math | |
| import random | |
| # ============================================================================ | |
| # Data Structures: Realm-Agnostic STAT7 Addressing | |
| # ============================================================================ | |
@dataclass
class Realm:
    """Flexible realm definition for any relationship domain.

    Declared as a dataclass so that instances can be created with
    ``Realm(type=..., label=...)`` and compared by value.
    """

    # Domain category, e.g. "game", "system", "faculty", "pattern", "data",
    # "narrative", "business", "concept".
    type: str
    # Human-readable name of the realm.
    label: str
@dataclass
class STAT7Address:
    """
    STAT7 coordinate system: 7 dimensions for unique, multidimensional addressing.

    - realm: Domain/context (flexible type + label)
    - lineage: Version/generation (int >= 0)
    - adjacency: Graph connectivity score (0.0-1.0)
    - horizon: Zoom level / lifecycle stage (logline, outline, scene, panel, etc.)
    - luminosity: Clarity/coherence/activity (0.0-1.0)
    - polarity: Tension/contrast/resonance (0.0-1.0)
    - dimensionality: Complexity/thread count (validated to lie in 1-8)

    An additional ``entropy`` signal (0.0-1.0) is validated and serialized
    alongside the seven coordinates. It is optional so that addresses built
    from the seven documented dimensions alone remain valid.
    """

    realm: "Realm"
    lineage: int
    adjacency: float
    horizon: str
    luminosity: float
    polarity: float
    dimensionality: int
    # NOTE(review): 0.0 is an assumed default for callers that omit entropy;
    # confirm the intended neutral value.
    entropy: float = 0.0

    def __post_init__(self):
        """Validate STAT7 constraints.

        Raises:
            AssertionError: if any coordinate lies outside its allowed range.
        """
        assert 0.0 <= self.adjacency <= 1.0, f"adjacency must be [0,1], got {self.adjacency}"
        assert 0.0 <= self.luminosity <= 1.0, f"luminosity must be [0,1], got {self.luminosity}"
        assert 0.0 <= self.polarity <= 1.0, f"polarity must be [0,1], got {self.polarity}"
        assert 0.0 <= self.entropy <= 1.0, f"entropy must be [0,1], got {self.entropy}"
        assert self.lineage >= 0, f"lineage must be >= 0, got {self.lineage}"
        assert (
            1 <= self.dimensionality <= 8
        ), f"dimensionality must be [1,8], got {self.dimensionality}"

    def to_dict(self) -> Dict[str, Any]:
        """Export as dictionary for serialization.

        The realm is flattened to a nested ``{"type", "label"}`` mapping; all
        other coordinates are copied as-is.
        """
        return {
            "realm": {"type": self.realm.type, "label": self.realm.label},
            "lineage": self.lineage,
            "adjacency": self.adjacency,
            "horizon": self.horizon,
            "luminosity": self.luminosity,
            "polarity": self.polarity,
            "dimensionality": self.dimensionality,
            "entropy": self.entropy,
        }
@dataclass
class RAGDocument:
    """RAG document enhanced with STAT7 addressing.

    Declared as a dataclass so that ``metadata`` receives a fresh dict per
    instance and ``__post_init__`` validation runs on construction.
    """

    # Unique document identifier.
    id: str
    # Raw document text.
    text: str
    # Embedding vector for the text; must be non-empty.
    embedding: List[float]
    # STAT7 address attached to this document.
    stat7: "STAT7Address"
    # Free-form extra attributes; defaults to a new empty dict per document.
    metadata: Dict[str, Any] = field(default_factory=dict)

    def __post_init__(self):
        """Validate document structure.

        Raises:
            AssertionError: if the embedding is empty.
        """
        assert len(self.embedding) > 0, f"embedding must not be empty for {self.id}"
| # ============================================================================ | |
| # Scoring Functions: Semantic + STAT7 Hybrid | |
| # ============================================================================ | |
def cosine_similarity(a: List[float], b: List[float]) -> float:
    """
    Return the cosine of the angle between two embedding vectors.

    The result lies in [-1, 1] (typically [0, 1] for normalized embeddings).
    An empty vector on either side yields 0.0. If the vectors differ in
    length, the dot product covers only the overlapping prefix while each
    norm is taken over the full vector.
    """
    if not (a and b):
        return 0.0

    def _magnitude(vec):
        # Euclidean length of a vector.
        return math.sqrt(sum(component * component for component in vec))

    inner_product = sum(left * right for left, right in zip(a, b))
    # The small epsilon keeps the denominator non-zero for all-zero vectors.
    scale = _magnitude(a) * _magnitude(b) + 1e-12
    return inner_product / scale
def stat7_resonance(query_stat7: STAT7Address, doc_stat7: STAT7Address) -> float:
    """
    Score how well a document's STAT7 address aligns with the query's.

    Components (all multiplied together):
    - Realm: 1.0 when realm types match, 0.85 otherwise; a matching label
      adds 0.1, capped at 1.0.
    - Horizon: 1.0 for the same level, 0.9 for neighbouring levels, 0.7
      otherwise. Unknown horizon names are treated like "scene".
    - Lineage: loses 0.05 per generation of distance, never below 0.7.
    - Signal: one minus the mean absolute difference of luminosity,
      polarity and entropy, floored at 0.0.
    - Connectivity/complexity: the product above is scaled by a factor in
      [0.8, 1.0] derived from the document's adjacency and dimensionality.

    Returns: resonance score clamped to [0.0, 1.0]
    """
    q, d = query_stat7, doc_stat7

    # Realm: type is the primary signal, label a secondary boost.
    realm_part = 1.0 if q.realm.type == d.realm.type else 0.85
    if q.realm.label == d.realm.label:
        realm_part += 0.1
    realm_part = min(realm_part, 1.0)

    # Horizon: compare positions on the zoom ladder.
    ladder = {"logline": 1, "outline": 2, "scene": 3, "panel": 4}
    horizon_gap = abs(ladder.get(q.horizon, 3) - ladder.get(d.horizon, 3))
    horizon_part = {0: 1.0, 1: 0.9}.get(horizon_gap, 0.7)

    # Lineage: linear decay with a floor.
    generation_gap = abs(q.lineage - d.lineage)
    lineage_part = max(0.7, 1.0 - 0.05 * generation_gap)

    # Signal: average disagreement across the three continuous signals.
    lum_gap = abs(q.luminosity - d.luminosity)
    pol_gap = abs(q.polarity - d.polarity)
    ent_gap = abs(q.entropy - d.entropy)
    signal_part = max(0.0, 1.0 - (1 / 3) * (lum_gap + pol_gap + ent_gap))

    # Connectivity and complexity of the document, each weighted equally.
    complexity = min(1.0, d.dimensionality / 7.0)
    connectivity_mix = 0.5 * d.adjacency + 0.5 * complexity

    # Strict alignment is multiplicative; connectivity contributes up to 20%.
    combined = realm_part * horizon_part * lineage_part * signal_part
    combined *= 0.8 + 0.2 * connectivity_mix
    return max(0.0, min(combined, 1.0))
def hybrid_score(
    query_embedding: List[float],
    doc: RAGDocument,
    query_stat7: STAT7Address,
    weight_semantic: float = 0.6,
    weight_stat7: float = 0.4,
) -> float:
    """
    Hybrid scoring: combine semantic similarity with STAT7 resonance.

    Args:
        query_embedding: Query embedding vector
        doc: RAG document with embedding and STAT7 address
        query_stat7: Query STAT7 address
        weight_semantic: Weight for semantic similarity (default 0.6)
        weight_stat7: Weight for STAT7 resonance (default 0.4)

    Returns: hybrid score clamped to [0.0, 1.0]

    Raises:
        AssertionError: if the two weights do not sum to 1.0 (within
            floating-point tolerance).
    """
    # Compare with a tolerance: weights such as computed complements can miss
    # 1.0 by a rounding error, which an exact == check would reject.
    assert math.isclose(weight_semantic + weight_stat7, 1.0), "Weights must sum to 1.0"
    semantic_sim = cosine_similarity(query_embedding, doc.embedding)
    stat7_res = stat7_resonance(query_stat7, doc.stat7)
    hybrid = (weight_semantic * semantic_sim) + (weight_stat7 * stat7_res)
    # Cosine similarity may be negative, so clamp the blend to [0, 1].
    return max(0.0, min(hybrid, 1.0))
| # ============================================================================ | |
| # Retrieval: Hybrid RAG Search | |
| # ============================================================================ | |
def retrieve(
    documents: List[RAGDocument],
    query_embedding: List[float],
    query_stat7: STAT7Address,
    k: int = 10,
    weight_semantic: float = 0.6,
    weight_stat7: float = 0.4,
) -> List[Tuple[str, float]]:
    """
    Rank documents by hybrid (semantic + STAT7) score and keep the best k.

    Args:
        documents: List of RAG documents to search
        query_embedding: Query embedding vector
        query_stat7: Query STAT7 address
        k: Number of results to return
        weight_semantic: Weight for semantic similarity
        weight_stat7: Weight for STAT7 resonance

    Returns: List of (doc_id, hybrid_score) tuples, highest score first
    """
    scored_pairs = [
        (
            candidate.id,
            hybrid_score(query_embedding, candidate, query_stat7, weight_semantic, weight_stat7),
        )
        for candidate in documents
    ]
    # Order best-first, then truncate to the requested size.
    scored_pairs.sort(key=lambda pair: pair[1], reverse=True)
    return scored_pairs[:k]
def retrieve_semantic_only(
    documents: List[RAGDocument],
    query_embedding: List[float],
    k: int = 10,
) -> List[Tuple[str, float]]:
    """
    Baseline retrieval: rank documents by cosine similarity alone.

    Args:
        documents: List of RAG documents to search
        query_embedding: Query embedding vector
        k: Number of results to return

    Returns: List of (doc_id, semantic_score) tuples, highest score first
    """
    scored_pairs = [
        (candidate.id, cosine_similarity(query_embedding, candidate.embedding))
        for candidate in documents
    ]
    # Order best-first, then truncate to the requested size.
    scored_pairs.sort(key=lambda pair: pair[1], reverse=True)
    return scored_pairs[:k]
| # ============================================================================ | |
| # Utilities: Document Generation & STAT7 Randomization | |
| # ============================================================================ | |
def generate_random_stat7_address(
    realm: Realm,
    lineage_range: Tuple[int, int] = (0, 10),
    horizon_choices: Optional[List[str]] = None,
    seed_offset: int = 0,
) -> STAT7Address:
    """
    Build a STAT7 address whose coordinates are drawn from the global
    ``random`` state.

    Args:
        realm: Realm for this address
        lineage_range: Inclusive min/max for the lineage draw
        horizon_choices: Horizon options to pick from (default: logline,
            outline, scene, panel)
        seed_offset: Accepted for interface compatibility; the body does not
            read it. Reproducibility depends on the caller seeding ``random``.

    Returns: Randomized STAT7Address
    """
    horizons = (
        ["logline", "outline", "scene", "panel"] if horizon_choices is None else horizon_choices
    )

    # Draws happen in a fixed order so a seeded run yields the same address.
    lineage = random.randint(lineage_range[0], lineage_range[1])
    adjacency = round(random.random(), 2)
    horizon = random.choice(horizons)
    luminosity = round(random.random(), 2)
    polarity = round(random.random(), 2)
    dimensionality = random.randint(1, 8)
    entropy = round(random.random(), 2)

    return STAT7Address(
        realm=realm,
        lineage=lineage,
        adjacency=adjacency,
        horizon=horizon,
        luminosity=luminosity,
        polarity=polarity,
        dimensionality=dimensionality,
        entropy=entropy,
    )
def generate_synthetic_rag_documents(
    base_texts: List[str],
    realm: Realm,
    scale: int,
    embedding_fn: callable,
    randomize_stat7: bool = False,
    seed: Optional[int] = None,
) -> List[RAGDocument]:
    """
    Generate synthetic RAG documents with STAT7 addresses.

    Args:
        base_texts: List of base text templates (cycled through by index)
        realm: Realm for all generated documents
        scale: Number of documents to generate
        embedding_fn: Function to embed text (e.g., embedding_provider.embed_text)
        randomize_stat7: If True, draw each document's STAT7 address at
            random; otherwise derive it deterministically from the index
        seed: Random seed for reproducibility; when given, it reseeds the
            global ``random`` state

    Returns: List of RAGDocument with embeddings and STAT7 addresses

    Raises:
        ValueError: if ``base_texts`` is empty while ``scale`` is positive.
    """
    # Fail early with a clear message instead of a ZeroDivisionError from the
    # modulo below, and before the global RNG state is touched.
    if scale > 0 and not base_texts:
        raise ValueError("base_texts must not be empty when scale > 0")

    if seed is not None:
        random.seed(seed)

    horizons = ["logline", "outline", "scene", "panel"]
    categories = ["core", "wisdom", "politics"]

    documents = []
    for i in range(scale):
        # Vary text template
        base_idx = i % len(base_texts)
        base_text = base_texts[base_idx]
        text = f"[Context {i}] {base_text} (instance {i})"

        # Embed text
        embedding = embedding_fn(text)

        # Assign STAT7 address
        if randomize_stat7:
            stat7 = generate_random_stat7_address(realm, seed_offset=i)
        else:
            # Deterministic: map index to STAT7 dimensions
            stat7 = STAT7Address(
                realm=realm,
                lineage=i % 10,
                adjacency=round((i % 100) / 100.0, 2),
                horizon=horizons[i % 4],
                luminosity=round((i % 10) / 10.0, 2),
                polarity=round(((i + 5) % 10) / 10.0, 2),
                dimensionality=1 + (i % 7),
            )

        documents.append(
            RAGDocument(
                id=f"doc-{i:06d}",
                text=text,
                embedding=embedding,
                stat7=stat7,
                metadata={
                    "source": f"pack-{base_idx % 3}",
                    "category": categories[base_idx % 3],
                    "generated_index": i,
                },
            )
        )
    return documents
| # ============================================================================ | |
| # Analysis: Comparison & Diagnostics | |
| # ============================================================================ | |
def compare_retrieval_results(
    semantic_results: List[Tuple[str, float]],
    hybrid_results: List[Tuple[str, float]],
    k: int = 10,
) -> Dict[str, Any]:
    """
    Compare semantic-only vs hybrid retrieval results.

    Args:
        semantic_results: (doc_id, score) pairs from semantic-only retrieval,
            best first
        hybrid_results: (doc_id, score) pairs from hybrid retrieval, best first
        k: Size of the top-k window to compare; values <= 0 compare nothing

    Returns a dict with:
    - overlap_count: number of doc ids present in both top-k windows
    - overlap_pct: overlap_count as a percentage of k
    - semantic_avg_score: mean semantic score over the results actually
      present in the top-k window
    - hybrid_avg_score: mean hybrid score over the results actually present
      in the top-k window
    - score_improvement: hybrid average minus semantic average
    - avg_reranking_distance: mean absolute rank shift of the shared docs
    """
    # A non-positive k means an empty window; slicing with a negative k would
    # otherwise silently drop items from the tail instead.
    top_semantic = semantic_results[:k] if k > 0 else []
    top_hybrid = hybrid_results[:k] if k > 0 else []

    def _mean_score(pairs):
        # Average over the pairs that exist, so a short result list is not
        # diluted by dividing by k.
        return sum(score for _, score in pairs) / len(pairs) if pairs else 0.0

    semantic_rank = {doc_id: idx for idx, (doc_id, _) in enumerate(top_semantic)}
    hybrid_ids = {doc_id for doc_id, _ in top_hybrid}

    overlap = len(semantic_rank.keys() & hybrid_ids)
    overlap_pct = (overlap / k * 100) if k > 0 else 0.0

    semantic_avg = _mean_score(top_semantic)
    hybrid_avg = _mean_score(top_hybrid)

    # Measure ranking distance: how far did shared top-k items move?
    reranking_distances = [
        abs(idx - semantic_rank[doc_id])
        for idx, (doc_id, _) in enumerate(top_hybrid)
        if doc_id in semantic_rank
    ]
    avg_reranking_distance = (
        sum(reranking_distances) / len(reranking_distances) if reranking_distances else 0.0
    )

    return {
        "overlap_count": overlap,
        "overlap_pct": overlap_pct,
        "semantic_avg_score": round(semantic_avg, 4),
        "hybrid_avg_score": round(hybrid_avg, 4),
        "score_improvement": round(hybrid_avg - semantic_avg, 4),
        "avg_reranking_distance": round(avg_reranking_distance, 2),
    }
| # ============================================================================ | |
| # STAT7RAGBridge: Wrapper for RetrievalAPI Integration | |
| # ============================================================================ | |
class STAT7RAGBridge:
    """
    Object-style facade over the module-level STAT7 scoring functions.

    Each method forwards its arguments unchanged to the module-level function
    of the same name (stat7_resonance, hybrid_score, retrieve), so a
    RetrievalAPI can receive the STAT7 scoring behaviour as an injected
    dependency rather than importing the functions directly.
    """

    def stat7_resonance(self, query_stat7: STAT7Address, doc_stat7: STAT7Address) -> float:
        """
        Delegate to the module-level ``stat7_resonance``.

        Args:
            query_stat7: Query STAT7 address
            doc_stat7: Document STAT7 address

        Returns: [0.0, 1.0] resonance score
        """
        return stat7_resonance(query_stat7=query_stat7, doc_stat7=doc_stat7)

    def hybrid_score(
        self,
        query_embedding: List[float],
        doc: RAGDocument,
        query_stat7: STAT7Address,
        weight_semantic: float = 0.6,
        weight_stat7: float = 0.4,
    ) -> float:
        """
        Delegate to the module-level ``hybrid_score``.

        Args:
            query_embedding: Query embedding vector
            doc: RAG document with embedding and STAT7 address
            query_stat7: Query STAT7 address
            weight_semantic: Weight for semantic similarity (default 0.6)
            weight_stat7: Weight for STAT7 resonance (default 0.4)

        Returns: [0.0, 1.0] hybrid score
        """
        return hybrid_score(
            query_embedding=query_embedding,
            doc=doc,
            query_stat7=query_stat7,
            weight_semantic=weight_semantic,
            weight_stat7=weight_stat7,
        )

    def retrieve(
        self,
        documents: List[RAGDocument],
        query_embedding: List[float],
        query_stat7: STAT7Address,
        k: int = 10,
        weight_semantic: float = 0.6,
        weight_stat7: float = 0.4,
    ) -> List[Tuple[str, float]]:
        """
        Delegate to the module-level ``retrieve``.

        Args:
            documents: List of RAG documents to search
            query_embedding: Query embedding vector
            query_stat7: Query STAT7 address
            k: Number of results to return
            weight_semantic: Weight for semantic similarity
            weight_stat7: Weight for STAT7 resonance

        Returns: List of (doc_id, hybrid_score) tuples, sorted by score (descending)
        """
        return retrieve(
            documents=documents,
            query_embedding=query_embedding,
            query_stat7=query_stat7,
            k=k,
            weight_semantic=weight_semantic,
            weight_stat7=weight_stat7,
        )