# Source: MnemoCore / tests / test_binary_hdv.py
# Repository history: "Initial upload of MnemoCore" by Granis87 (commit dbb04e4, verified)
"""
HAIM Test Suite — Binary HDV Tests
===================================
Tests for the core BinaryHDV operations (Phase 3.0).
Validates mathematical properties of VSA operations.
"""
import numpy as np
import pytest
from mnemocore.core.binary_hdv import (
BinaryHDV,
TextEncoder,
batch_hamming_distance,
majority_bundle,
top_k_nearest,
)
# Default hypervector dimension (in bits) used by most tests in this module.
# Kept small for speed; the full 16,384-bit size is exercised separately in
# TestFullDimension. Must be a multiple of 8: the tests expect the backing
# array to hold D // 8 bytes, and BinaryHDV.random(100) is expected to fail.
D = 1024
class TestBinaryHDVConstruction:
    """Constructor, equality, and byte-serialization checks for BinaryHDV."""

    def test_random_creates_valid_vector(self):
        """A random vector reports dimension D and stores D // 8 uint8 values."""
        vec = BinaryHDV.random(D)
        expected_shape = (D // 8,)
        assert vec.dimension == D
        assert vec.data.shape == expected_shape
        assert vec.data.dtype == np.uint8

    def test_zeros(self):
        """Every stored byte of the zero vector is 0."""
        zero_vec = BinaryHDV.zeros(D)
        every_byte_clear = np.all(zero_vec.data == 0)
        assert every_byte_clear

    def test_ones(self):
        """Every stored byte of the ones vector is 0xFF."""
        ones_vec = BinaryHDV.ones(D)
        every_byte_set = np.all(ones_vec.data == 0xFF)
        assert every_byte_set

    def test_from_seed_deterministic(self):
        """The same seed string yields equal vectors."""
        first, second = (BinaryHDV.from_seed("hello", D) for _ in range(2))
        assert first == second

    def test_different_seeds_different_vectors(self):
        """Distinct seed strings yield unequal vectors."""
        from_hello = BinaryHDV.from_seed("hello", D)
        from_world = BinaryHDV.from_seed("world", D)
        assert from_hello != from_world

    def test_dimension_must_be_multiple_of_8(self):
        """Requesting a dimension of 100 (not divisible by 8) raises AssertionError."""
        bad_dimension = 100
        with pytest.raises(AssertionError):
            BinaryHDV.random(bad_dimension)

    def test_serialization_roundtrip(self):
        """to_bytes() emits D // 8 bytes and from_bytes() rebuilds an equal vector."""
        original = BinaryHDV.random(D)
        payload = original.to_bytes()
        assert len(payload) == D // 8
        restored = BinaryHDV.from_bytes(payload, D)
        assert original == restored
class TestXORBinding:
    """Algebraic properties expected of BinaryHDV.xor_bind."""

    def test_self_inverse(self):
        """a ⊕ a = 0 (zero vector)."""
        vec = BinaryHDV.random(D)
        bound_with_self = vec.xor_bind(vec)
        assert bound_with_self == BinaryHDV.zeros(D)

    def test_commutative(self):
        """a ⊕ b = b ⊕ a."""
        left = BinaryHDV.random(D)
        right = BinaryHDV.random(D)
        forward = left.xor_bind(right)
        backward = right.xor_bind(left)
        assert forward == backward

    def test_associative(self):
        """(a ⊕ b) ⊕ c = a ⊕ (b ⊕ c)."""
        x = BinaryHDV.random(D)
        y = BinaryHDV.random(D)
        z = BinaryHDV.random(D)
        # Group to the left first, then to the right.
        xy = x.xor_bind(y)
        left_grouped = xy.xor_bind(z)
        yz = y.xor_bind(z)
        right_grouped = x.xor_bind(yz)
        assert left_grouped == right_grouped

    def test_xor_with_zeros_is_identity(self):
        """a ⊕ 0 = a."""
        vec = BinaryHDV.random(D)
        zero_vec = BinaryHDV.zeros(D)
        unchanged = vec.xor_bind(zero_vec)
        assert unchanged == vec

    def test_unbinding(self):
        """If c = a ⊕ b, then a = c ⊕ b (self-inverse property enables unbinding)."""
        payload = BinaryHDV.random(D)
        key = BinaryHDV.random(D)
        bound = payload.xor_bind(key)
        # Binding again with the same key should undo the first binding.
        unbound = bound.xor_bind(key)
        assert unbound == payload

    def test_binding_preserves_distance(self):
        """hamming(a ⊕ c, b ⊕ c) = hamming(a, b)."""
        first = BinaryHDV.random(D)
        second = BinaryHDV.random(D)
        shared_key = BinaryHDV.random(D)
        plain_distance = first.hamming_distance(second)
        first_bound = first.xor_bind(shared_key)
        second_bound = second.xor_bind(shared_key)
        bound_distance = first_bound.hamming_distance(second_bound)
        assert plain_distance == bound_distance
class TestHammingDistance:
    """Metric properties and value ranges of the distance/similarity methods."""

    def test_self_distance_is_zero(self):
        """hamming(a, a) = 0."""
        vec = BinaryHDV.random(D)
        distance_to_self = vec.hamming_distance(vec)
        assert distance_to_self == 0

    def test_inverse_is_max_distance(self):
        """hamming(a, ~a) = dimension."""
        vec = BinaryHDV.random(D)
        complement = vec.invert()
        assert vec.hamming_distance(complement) == D

    def test_symmetry(self):
        """hamming(a, b) = hamming(b, a)."""
        left = BinaryHDV.random(D)
        right = BinaryHDV.random(D)
        forward = left.hamming_distance(right)
        backward = right.hamming_distance(left)
        assert forward == backward

    def test_triangle_inequality(self):
        """hamming(a, c) <= hamming(a, b) + hamming(b, c)."""
        start = BinaryHDV.random(D)
        middle = BinaryHDV.random(D)
        end = BinaryHDV.random(D)
        direct = start.hamming_distance(end)
        detour = start.hamming_distance(middle) + middle.hamming_distance(end)
        assert direct <= detour

    def test_random_vectors_near_half_dimension(self):
        """Random vectors should have Hamming distance ≈ D/2."""
        np.random.seed(42)
        # 50 independent random pairs; the receiver vector is drawn before
        # the argument vector in each pair.
        distances = [
            BinaryHDV.random(D).hamming_distance(BinaryHDV.random(D))
            for _ in range(50)
        ]
        mean_dist = np.mean(distances)
        expected = D / 2  # 512 for D=1024
        tolerance = D * 0.05  # Within 5% of expected
        assert abs(mean_dist - expected) < tolerance

    def test_similarity_score_range(self):
        """similarity() of two random vectors lies in [0.0, 1.0]."""
        left = BinaryHDV.random(D)
        right = BinaryHDV.random(D)
        score = left.similarity(right)
        assert 0.0 <= score <= 1.0

    def test_normalized_distance_range(self):
        """normalized_distance() of two random vectors lies in [0.0, 1.0]."""
        left = BinaryHDV.random(D)
        right = BinaryHDV.random(D)
        norm_dist = left.normalized_distance(right)
        assert 0.0 <= norm_dist <= 1.0
class TestPermutation:
    """Behaviour of BinaryHDV.permute for zero, full-cycle, and inverse shifts."""

    def test_permute_zero_is_identity(self):
        """Permuting by 0 returns a vector equal to the original."""
        vec = BinaryHDV.random(D)
        unshifted = vec.permute(0)
        assert unshifted == vec

    def test_permute_full_cycle(self):
        """Permuting by D should return the original vector."""
        vec = BinaryHDV.random(D)
        full_turn = vec.permute(D)
        assert full_turn == vec

    def test_permute_produces_different_vector(self):
        """Non-zero permutation should produce a (very likely) different vector."""
        vec = BinaryHDV.random(D)
        shifted = vec.permute(1)
        assert vec != shifted

    def test_permute_is_invertible(self):
        """permute(k) followed by permute(-k) recovers original."""
        vec = BinaryHDV.random(D)
        shift = 7
        there_and_back = vec.permute(shift).permute(-shift)
        assert vec == there_and_back
class TestMajorityBundle:
    """Checks for majority_bundle: identity on one input, similarity to inputs, lossy output."""

    def test_single_vector_bundle(self):
        """Bundling a single vector returns that vector."""
        a = BinaryHDV.random(D)
        result = majority_bundle([a])
        assert result == a

    def test_bundled_vector_similar_to_inputs(self):
        """Bundle of {a, b, c} should be more similar to each input than random."""
        np.random.seed(42)
        a = BinaryHDV.random(D)
        b = BinaryHDV.random(D)
        c = BinaryHDV.random(D)
        bundled = majority_bundle([a, b, c])

        # Baseline: similarity of the bundle to an unrelated random vector.
        # It does not depend on the input being checked, so compute it once
        # instead of once per loop iteration.
        random_v = BinaryHDV.random(D)
        sim_to_random = bundled.similarity(random_v)

        # Each input should be closer to the bundle than the random vector is.
        for v in [a, b, c]:
            sim_to_bundle = bundled.similarity(v)
            assert sim_to_bundle > sim_to_random, (
                f"Bundle should be more similar to its inputs than to random vectors. "
                f"sim_to_bundle={sim_to_bundle:.3f}, sim_to_random={sim_to_random:.3f}"
            )

    def test_bundle_is_approximate(self):
        """Bundle is not exact — it's a lossy superposition."""
        a = BinaryHDV.random(D)
        b = BinaryHDV.random(D)
        bundled = majority_bundle([a, b])
        # Bundled vector should be similar but not identical to either input
        assert bundled != a
        assert bundled != b
        assert bundled.similarity(a) > 0.5
        assert bundled.similarity(b) > 0.5

    def test_empty_bundle_raises(self):
        """Bundling an empty list is expected to raise AssertionError."""
        with pytest.raises(AssertionError):
            majority_bundle([])
class TestBatchOperations:
    """Checks for batch_hamming_distance and top_k_nearest over a stacked database."""

    def test_batch_hamming_distance(self):
        """Batch Hamming should match individual computations."""
        np.random.seed(42)
        query = BinaryHDV.random(D)
        count = 100
        rows = [BinaryHDV.random(D).data for _ in range(count)]
        database = np.stack(rows, axis=0)

        batch_distances = batch_hamming_distance(query, database)
        assert batch_distances.shape == (count,)

        # Verify against individual computations, one database row at a time.
        for row, batched in zip(database, batch_distances):
            candidate = BinaryHDV(data=row, dimension=D)
            assert batched == query.hamming_distance(candidate)

    def test_top_k_nearest(self):
        """Top-K should return the K closest vectors."""
        np.random.seed(42)
        query = BinaryHDV.random(D)
        count = 50
        vectors = [BinaryHDV.random(D) for _ in range(count)]
        database = np.stack([vec.data for vec in vectors], axis=0)

        # Plant a near-duplicate of the query at row 0: a copy of the query
        # bytes with only the two lowest bits of the first byte flipped.
        near_duplicate = query.data.copy()
        near_duplicate[0] ^= 0x03
        database[0] = near_duplicate

        results = top_k_nearest(query, database, k=5)
        assert len(results) == 5

        # First result should be index 0 (the close vector)
        best = results[0]
        assert best[0] == 0

        # Distances should be sorted ascending
        ranked_distances = [entry[1] for entry in results]
        for nearer, farther in zip(ranked_distances, ranked_distances[1:]):
            assert nearer <= farther
class TestTextEncoder:
    """Checks for TextEncoder: determinism, similarity ordering, context binding, caching."""

    def test_encode_deterministic(self):
        """Encoding the same text twice with one encoder yields equal vectors."""
        encoder = TextEncoder(dimension=D)
        text = "hello world"
        first = encoder.encode(text)
        second = encoder.encode(text)
        assert first == second

    def test_different_texts_different_vectors(self):
        """Two unrelated texts encode to unequal vectors."""
        encoder = TextEncoder(dimension=D)
        greeting = encoder.encode("hello world")
        farewell = encoder.encode("goodbye moon")
        assert greeting != farewell

    def test_similar_texts_more_similar(self):
        """Texts sharing words should be more similar than completely different texts."""
        np.random.seed(42)
        encoder = TextEncoder(dimension=D)
        v_base = encoder.encode("the quick brown fox")
        v_similar = encoder.encode("the quick brown dog")
        v_different = encoder.encode("quantum computing research paper")

        sim_similar = v_base.similarity(v_similar)
        sim_different = v_base.similarity(v_different)
        failure_note = (
            f"Similar text should have higher similarity. "
            f"sim_similar={sim_similar:.3f}, sim_different={sim_different:.3f}"
        )
        assert sim_similar > sim_different, failure_note

    def test_encode_with_context(self):
        """Context-bound encoding differs from the plain one and unbinds back to it."""
        encoder = TextEncoder(dimension=D)
        context = BinaryHDV.random(D)
        with_context = encoder.encode_with_context("hello world", context)

        # Should be different from encoding without context
        plain = encoder.encode("hello world")
        assert with_context != plain

        # XOR with context should recover the content encoding
        unbound = with_context.xor_bind(context)
        assert unbound == plain

    def test_empty_text(self):
        """Empty text should still produce a valid vector."""
        encoder = TextEncoder(dimension=D)
        encoded = encoder.encode("")
        assert encoded.dimension == D
        assert encoded.data.shape == (D // 8,)

    def test_token_caching(self):
        """After encoding, each word of the text is a key in the encoder's _token_cache."""
        encoder = TextEncoder(dimension=D)
        encoder.encode("hello world")
        for token in ("hello", "world"):
            assert token in encoder._token_cache
class TestFullDimension:
    """Tests at full 16,384 dimensions to verify scaling."""

    def test_full_dim_roundtrip(self):
        """A 16,384-bit vector occupies 2048 bytes and survives a bytes round trip."""
        full_dim = 16384
        packed_len = 2048  # 16384 / 8
        original = BinaryHDV.random(full_dim)
        assert original.data.shape == (packed_len,)
        payload = original.to_bytes()
        assert len(payload) == packed_len
        restored = BinaryHDV.from_bytes(payload, full_dim)
        assert original == restored

    def test_full_dim_hamming(self):
        """Two random full-size vectors differ in roughly half of their bits."""
        full_dim = 16384
        left = BinaryHDV.random(full_dim)
        right = BinaryHDV.random(full_dim)
        distance = left.hamming_distance(right)
        # Should be roughly D/2 = 8192
        assert 6000 < distance < 10000

    def test_full_dim_batch_search(self):
        """top_k_nearest over 1000 full-size vectors returns 10 results in ascending distance."""
        np.random.seed(42)
        full_dim = 16384
        query = BinaryHDV.random(full_dim)
        count = 1000
        rows = [BinaryHDV.random(full_dim).data for _ in range(count)]
        database = np.stack(rows, axis=0)

        results = top_k_nearest(query, database, k=10)
        assert len(results) == 10

        # Verify sorted
        ranked_distances = [entry[1] for entry in results]
        for nearer, farther in zip(ranked_distances, ranked_distances[1:]):
            assert nearer <= farther