MnemoCore / tests /test_binary_hdv.py

Initial upload of MnemoCore

dbb04e4 verified 9 days ago

11.7 kB

	"""
	HAIM Test Suite — Binary HDV Tests
	===================================
	Tests for the core BinaryHDV operations (Phase 3.0).
	Validates mathematical properties of VSA operations.
	"""

	import numpy as np
	import pytest

	from mnemocore.core.binary_hdv import (
	BinaryHDV,
	TextEncoder,
	batch_hamming_distance,
	majority_bundle,
	top_k_nearest,
	)


	# Default test dimension: much smaller than the full 16,384 bits exercised
	# in TestFullDimension, to keep the suite fast. The tests assert that the
	# vector data holds D // 8 uint8 values, so D must be a multiple of 8.
	D = 1024


	class TestBinaryHDVConstruction:
	    """Factory methods, seeding, validation and byte serialization of BinaryHDV."""

	    def test_random_creates_valid_vector(self):
	        """A random vector reports dimension D and stores D // 8 uint8 values."""
	        vec = BinaryHDV.random(D)
	        assert vec.dimension == D
	        packed = vec.data
	        assert packed.shape == (D // 8,)
	        assert packed.dtype == np.uint8

	    def test_zeros(self):
	        """Every stored byte of the zero vector is 0."""
	        zero_vec = BinaryHDV.zeros(D)
	        assert not np.any(zero_vec.data)

	    def test_ones(self):
	        """Every stored byte of the ones vector is 0xFF."""
	        ones_vec = BinaryHDV.ones(D)
	        assert (ones_vec.data == 0xFF).all()

	    def test_from_seed_deterministic(self):
	        """The same seed string yields equal vectors."""
	        first = BinaryHDV.from_seed("hello", D)
	        second = BinaryHDV.from_seed("hello", D)
	        assert first == second

	    def test_different_seeds_different_vectors(self):
	        """Distinct seed strings yield unequal vectors."""
	        from_hello = BinaryHDV.from_seed("hello", D)
	        from_world = BinaryHDV.from_seed("world", D)
	        assert from_hello != from_world

	    def test_dimension_must_be_multiple_of_8(self):
	        """A dimension that is not a multiple of 8 is rejected with AssertionError."""
	        bad_dimension = 100  # 100 % 8 != 0
	        with pytest.raises(AssertionError):
	            BinaryHDV.random(bad_dimension)

	    def test_serialization_roundtrip(self):
	        """to_bytes() emits D // 8 bytes and from_bytes() rebuilds an equal vector."""
	        original = BinaryHDV.random(D)
	        payload = original.to_bytes()
	        assert len(payload) == D // 8
	        restored = BinaryHDV.from_bytes(payload, D)
	        assert original == restored


	class TestXORBinding:
	    """Algebraic properties expected of XOR binding on BinaryHDV."""

	    def test_self_inverse(self):
	        """a ⊕ a = 0 (zero vector)."""
	        vec = BinaryHDV.random(D)
	        bound_with_self = vec.xor_bind(vec)
	        assert bound_with_self == BinaryHDV.zeros(D)

	    def test_commutative(self):
	        """a ⊕ b = b ⊕ a."""
	        left, right = (BinaryHDV.random(D) for _ in range(2))
	        forward = left.xor_bind(right)
	        backward = right.xor_bind(left)
	        assert forward == backward

	    def test_associative(self):
	        """(a ⊕ b) ⊕ c = a ⊕ (b ⊕ c)."""
	        x, y, z = (BinaryHDV.random(D) for _ in range(3))
	        grouped_left = x.xor_bind(y).xor_bind(z)
	        grouped_right = x.xor_bind(y.xor_bind(z))
	        assert grouped_left == grouped_right

	    def test_xor_with_zeros_is_identity(self):
	        """a ⊕ 0 = a."""
	        vec = BinaryHDV.random(D)
	        zero_vec = BinaryHDV.zeros(D)
	        assert vec.xor_bind(zero_vec) == vec

	    def test_unbinding(self):
	        """If c = a ⊕ b, then c ⊕ b = a, so binding can be undone with the same key."""
	        payload = BinaryHDV.random(D)
	        key = BinaryHDV.random(D)
	        bound = payload.xor_bind(key)
	        assert bound.xor_bind(key) == payload

	    def test_binding_preserves_distance(self):
	        """hamming(a⊕c, b⊕c) = hamming(a, b)."""
	        first, second, key = (BinaryHDV.random(D) for _ in range(3))
	        plain_distance = first.hamming_distance(second)
	        bound_first = first.xor_bind(key)
	        bound_second = second.xor_bind(key)
	        assert plain_distance == bound_first.hamming_distance(bound_second)


	class TestHammingDistance:
	    """Metric properties and value ranges of the Hamming-based measures."""

	    def test_self_distance_is_zero(self):
	        """hamming(a, a) = 0."""
	        vec = BinaryHDV.random(D)
	        assert vec.hamming_distance(vec) == 0

	    def test_inverse_is_max_distance(self):
	        """hamming(a, ~a) = dimension."""
	        vec = BinaryHDV.random(D)
	        flipped = vec.invert()
	        assert vec.hamming_distance(flipped) == D

	    def test_symmetry(self):
	        """hamming(a, b) = hamming(b, a)."""
	        first, second = (BinaryHDV.random(D) for _ in range(2))
	        forward = first.hamming_distance(second)
	        backward = second.hamming_distance(first)
	        assert forward == backward

	    def test_triangle_inequality(self):
	        """hamming(a, c) <= hamming(a, b) + hamming(b, c)."""
	        start, middle, end = (BinaryHDV.random(D) for _ in range(3))
	        direct = start.hamming_distance(end)
	        detour = start.hamming_distance(middle) + middle.hamming_distance(end)
	        assert direct <= detour

	    def test_random_vectors_near_half_dimension(self):
	        """Random vectors should have Hamming distance ≈ D/2."""
	        np.random.seed(42)
	        # 50 independent pairs; within each pair the first operand is drawn first.
	        distances = [
	            BinaryHDV.random(D).hamming_distance(BinaryHDV.random(D))
	            for _ in range(50)
	        ]
	        expected = D / 2  # 512 for D=1024
	        tolerance = D * 0.05  # within 5% of the dimension
	        assert abs(np.mean(distances) - expected) < tolerance

	    def test_similarity_score_range(self):
	        """similarity() stays within [0, 1]."""
	        first, second = (BinaryHDV.random(D) for _ in range(2))
	        score = first.similarity(second)
	        assert 0.0 <= score <= 1.0

	    def test_normalized_distance_range(self):
	        """normalized_distance() stays within [0, 1]."""
	        first, second = (BinaryHDV.random(D) for _ in range(2))
	        fraction = first.normalized_distance(second)
	        assert 0.0 <= fraction <= 1.0


	class TestPermutation:
	    """Behaviour of BinaryHDV.permute for zero, full-cycle and inverse shifts."""

	    def test_permute_zero_is_identity(self):
	        """A shift of 0 leaves the vector unchanged."""
	        vec = BinaryHDV.random(D)
	        unshifted = vec.permute(0)
	        assert unshifted == vec

	    def test_permute_full_cycle(self):
	        """Permuting by D should return the original vector."""
	        vec = BinaryHDV.random(D)
	        wrapped = vec.permute(D)
	        assert wrapped == vec

	    def test_permute_produces_different_vector(self):
	        """Non-zero permutation should produce a (very likely) different vector."""
	        vec = BinaryHDV.random(D)
	        shifted = vec.permute(1)
	        assert vec != shifted

	    def test_permute_is_invertible(self):
	        """permute(k) followed by permute(-k) recovers original."""
	        shift = 7
	        vec = BinaryHDV.random(D)
	        round_trip = vec.permute(shift).permute(-shift)
	        assert vec == round_trip


	class TestMajorityBundle:
	    """Properties of majority_bundle: identity on one input, lossy similarity, empty input."""

	    def test_single_vector_bundle(self):
	        """Bundling a single vector returns that vector."""
	        a = BinaryHDV.random(D)
	        result = majority_bundle([a])
	        assert result == a

	    def test_bundled_vector_similar_to_inputs(self):
	        """Bundle of {a, b, c} should be more similar to each input than to a random vector."""
	        np.random.seed(42)
	        a = BinaryHDV.random(D)
	        b = BinaryHDV.random(D)
	        c = BinaryHDV.random(D)
	        bundled = majority_bundle([a, b, c])

	        # Baseline: similarity of the bundle to an unrelated random vector.
	        # It does not depend on the loop variable, so compute it once.
	        random_v = BinaryHDV.random(D)
	        sim_to_random = bundled.similarity(random_v)

	        # The bundle must be more similar to each of its inputs than to the baseline.
	        for v in [a, b, c]:
	            sim_to_bundle = bundled.similarity(v)
	            assert sim_to_bundle > sim_to_random, (
	                f"Bundle should be more similar to its inputs than to random vectors. "
	                f"sim_to_bundle={sim_to_bundle:.3f}, sim_to_random={sim_to_random:.3f}"
	            )

	    def test_bundle_is_approximate(self):
	        """Bundle is not exact — it's a lossy superposition."""
	        a = BinaryHDV.random(D)
	        b = BinaryHDV.random(D)
	        bundled = majority_bundle([a, b])
	        # Bundled vector should be similar but not identical to either input
	        assert bundled != a
	        assert bundled != b
	        assert bundled.similarity(a) > 0.5
	        assert bundled.similarity(b) > 0.5

	    def test_empty_bundle_raises(self):
	        """An empty input list is rejected with AssertionError."""
	        with pytest.raises(AssertionError):
	            majority_bundle([])


	class TestBatchOperations:
	    """Batch distance and top-k search over a stacked array of vector data."""

	    def test_batch_hamming_distance(self):
	        """Batch Hamming should match individual computations."""
	        np.random.seed(42)
	        query = BinaryHDV.random(D)
	        n = 100
	        rows = [BinaryHDV.random(D).data for _ in range(n)]
	        db = np.stack(rows, axis=0)

	        batch_distances = batch_hamming_distance(query, db)
	        assert batch_distances.shape == (n,)

	        # Each batch entry must equal the one-at-a-time distance for that row.
	        for row, batched in zip(db, batch_distances):
	            candidate = BinaryHDV(data=row, dimension=D)
	            assert batched == query.hamming_distance(candidate)

	    def test_top_k_nearest(self):
	        """Top-K should return the K closest vectors."""
	        np.random.seed(42)
	        query = BinaryHDV.random(D)
	        n = 50
	        db_vectors = [BinaryHDV.random(D) for _ in range(n)]
	        db = np.stack([vec.data for vec in db_vectors], axis=0)

	        # Plant a near-duplicate of the query in row 0: identical except for
	        # the two lowest bits of the first byte.
	        near_duplicate = query.data.copy()
	        near_duplicate[0] ^= 0x03
	        db[0] = near_duplicate

	        k = 5
	        results = top_k_nearest(query, db, k=k)
	        assert len(results) == k
	        # The planted row must rank first.
	        assert results[0][0] == 0
	        # Distances (second field of each result) must be non-decreasing.
	        distances = [results[i][1] for i in range(len(results))]
	        assert all(lo <= hi for lo, hi in zip(distances, distances[1:]))


	class TestTextEncoder:
	    """Determinism, similarity, context binding and caching of TextEncoder."""

	    def test_encode_deterministic(self):
	        """Encoding the same text twice yields equal vectors."""
	        encoder = TextEncoder(dimension=D)
	        text = "hello world"
	        first = encoder.encode(text)
	        second = encoder.encode(text)
	        assert first == second

	    def test_different_texts_different_vectors(self):
	        """Texts with no words in common encode to unequal vectors."""
	        encoder = TextEncoder(dimension=D)
	        greeting = encoder.encode("hello world")
	        farewell = encoder.encode("goodbye moon")
	        assert greeting != farewell

	    def test_similar_texts_more_similar(self):
	        """Texts sharing words should be more similar than completely different texts."""
	        np.random.seed(42)
	        encoder = TextEncoder(dimension=D)
	        v_base = encoder.encode("the quick brown fox")
	        v_similar = encoder.encode("the quick brown dog")
	        v_different = encoder.encode("quantum computing research paper")

	        sim_similar = v_base.similarity(v_similar)
	        sim_different = v_base.similarity(v_different)
	        assert sim_similar > sim_different, (
	            f"Similar text should have higher similarity. "
	            f"sim_similar={sim_similar:.3f}, sim_different={sim_different:.3f}"
	        )

	    def test_encode_with_context(self):
	        """Context-bound encoding differs from the plain one and unbinds back to it."""
	        encoder = TextEncoder(dimension=D)
	        text = "hello world"
	        context = BinaryHDV.random(D)
	        with_context = encoder.encode_with_context(text, context)
	        plain = encoder.encode(text)
	        # Binding with a context must change the vector...
	        assert with_context != plain
	        # ...and XOR-ing the same context back in must recover the plain encoding.
	        unbound = with_context.xor_bind(context)
	        assert unbound == plain

	    def test_empty_text(self):
	        """Empty text should still produce a valid vector."""
	        encoder = TextEncoder(dimension=D)
	        vec = encoder.encode("")
	        assert vec.dimension == D
	        assert vec.data.shape == (D // 8,)

	    def test_token_caching(self):
	        """After encoding, each token of the text is a key in the encoder's token cache."""
	        encoder = TextEncoder(dimension=D)
	        encoder.encode("hello world")
	        for token in ("hello", "world"):
	            assert token in encoder._token_cache


	class TestFullDimension:
	    """Tests at full 16,384 dimensions to verify scaling."""

	    def test_full_dim_roundtrip(self):
	        """A 16,384-bit vector occupies 2,048 bytes and survives a bytes round trip."""
	        dim = 16384
	        n_bytes = 2048  # 16384 / 8
	        original = BinaryHDV.random(dim)
	        assert original.data.shape == (n_bytes,)
	        payload = original.to_bytes()
	        assert len(payload) == n_bytes
	        restored = BinaryHDV.from_bytes(payload, dim)
	        assert original == restored

	    def test_full_dim_hamming(self):
	        """Two random full-size vectors differ in roughly half of their bits."""
	        dim = 16384
	        first = BinaryHDV.random(dim)
	        second = BinaryHDV.random(dim)
	        distance = first.hamming_distance(second)
	        # Expected value is dim / 2 = 8192; the accepted band is deliberately loose.
	        assert 6000 < distance < 10000

	    def test_full_dim_batch_search(self):
	        """top_k_nearest returns k results in non-decreasing distance order at full size."""
	        np.random.seed(42)
	        dim = 16384
	        query = BinaryHDV.random(dim)
	        n = 1000
	        rows = [BinaryHDV.random(dim).data for _ in range(n)]
	        db = np.stack(rows, axis=0)
	        results = top_k_nearest(query, db, k=10)
	        assert len(results) == 10
	        # Distances (second field of each result) must be non-decreasing.
	        distances = [results[i][1] for i in range(len(results))]
	        assert all(lo <= hi for lo, hi in zip(distances, distances[1:]))