Spaces:
Paused
Paused
| """Test RAGLite's chunk splitting functionality.""" | |
| import numpy as np | |
| import pytest | |
| from raglite._split_chunks import split_chunks | |
def test_edge_cases(sentences: list[str]) -> None:
    """Check the types, dtypes, and shapes of what ``split_chunks`` returns.

    Each sentence is paired with an all-ones float16 embedding of width 768, and
    the sentences are split with ``sentence_window_size=3`` and ``max_size=1440``.

    Args:
        sentences: The sentences to split. Supplied from outside this function
            (fixture or parametrization not visible here); may be empty.
    """
    n_sentences, embedding_dim = len(sentences), 768
    sentence_embeddings = np.ones((n_sentences, embedding_dim), dtype=np.float16)
    chunks, chunk_embeddings = split_chunks(
        sentences, sentence_embeddings, sentence_window_size=3, max_size=1440
    )
    # Both results must be plain lists.
    assert isinstance(chunks, list)
    assert isinstance(chunk_embeddings, list)
    # One embedding entry per chunk; with no sentences at all, exactly one entry
    # is still expected.
    expected_n_embeddings = len(chunks) if sentences else 1
    assert len(chunk_embeddings) == expected_n_embeddings
    # Element types: chunks are strings, chunk embeddings are NumPy arrays.
    for chunk in chunks:
        assert isinstance(chunk, str)
    for embedding in chunk_embeddings:
        assert isinstance(embedding, np.ndarray)
    # The input dtype must be carried over to every chunk embedding.
    for embedding in chunk_embeddings:
        assert embedding.dtype == sentence_embeddings.dtype
    # The rows across all chunk embeddings must add up to the number of input
    # sentence embeddings, and the embedding width must be unchanged.
    total_rows = sum(embedding.shape[0] for embedding in chunk_embeddings)
    assert total_rows == sentence_embeddings.shape[0]
    for embedding in chunk_embeddings:
        assert embedding.shape[1] == sentence_embeddings.shape[1]
def test_long_sentence(sentences: list[str]) -> None:
    """Test that chunking raises on sentences that are too long.

    ``split_chunks`` is called with ``max_size=1440`` and is expected to raise a
    ``ValueError`` carrying the "larger than chunk max_size" message.

    Args:
        sentences: The sentences to split. Supplied from outside this function
            (fixture or parametrization not visible here); presumably contains at
            least one sentence exceeding ``max_size`` -- confirm against the caller.
    """
    # Build the float16 array directly instead of allocating float64 and converting.
    sentence_embeddings = np.ones((len(sentences), 768), dtype=np.float16)
    # `match` is a regular expression applied with `re.search`, so the trailing
    # period is escaped to require a literal "." rather than any character.
    with pytest.raises(
        ValueError, match=r"Sentence with length larger than chunk max_size detected\."
    ):
        _ = split_chunks(sentences, sentence_embeddings, sentence_window_size=3, max_size=1440)