Spaces:

mrowaisabdullah
/

ai-humanoid-robotics

Paused

ai-humanoid-robotics / tests /unit /test_chunking.py

GitHub Actions

Deploy backend from GitHub Actions

bc8608f 15 days ago

7.1 kB

	"""Unit tests for document chunking functionality."""

	import pytest
	import hashlib
	from unittest.mock import Mock, patch
	from backend.rag.chunking import MarkdownChunker


	class TestMarkdownChunker:
	    """Test cases for MarkdownChunker class.

	    Every test receives a chunker from the ``chunker`` fixture, which is
	    configured with ``target_chunk_size=600`` and ``overlap_size=100``.
	    Chunks are expected to expose a ``content`` attribute and a
	    ``metadata`` mapping.
	    """

	    @pytest.fixture
	    def chunker(self):
	        """Create a chunker instance for testing."""
	        return MarkdownChunker(
	            target_chunk_size=600,
	            overlap_size=100,
	        )

	    @pytest.fixture
	    def sample_markdown(self):
	        """Sample markdown content for testing.

	        Contains regular chapters, a fenced code block, and a
	        "How to Use This Book" template section.
	        """
	        return """
	# Chapter 1: Introduction

	This is the introduction chapter with some content.

	## Section 1.1

	Here's some detailed content that should be chunked properly.

	```python
	def example_function():
	    print("This is a code block that shouldn't be split")
	    return True
	```

	More text after the code block.

	## Section 1.2

	Additional content in this section.

	# How to Use This Book

	This is a template section that should be filtered out.

	## Getting Started

	Template content here.
	"""

	    def test_chunking_basic_functionality(self, chunker, sample_markdown):
	        """Test basic chunking functionality."""
	        chunks = chunker.chunk_document(sample_markdown)

	        assert len(chunks) > 0, "Should generate at least one chunk"

	        # Check that chunks have required fields
	        required_keys = ('file_path', 'chunk_index', 'content_hash', 'token_count')
	        for chunk in chunks:
	            assert hasattr(chunk, 'content'), "Chunk must have content"
	            assert hasattr(chunk, 'metadata'), "Chunk must have metadata"
	            for key in required_keys:
	                assert key in chunk.metadata, f"Chunk must have {key} metadata"

	    def test_template_filtering(self, chunker):
	        """Test that template content is properly filtered."""
	        template_content = """
	# How to Use This Book

	This is template content.

	## Getting Started

	More template content.
	"""

	        chunks = chunker.chunk_document(template_content)

	        # All chunks from template content should be marked as template
	        for chunk in chunks:
	            # The message is parenthesised so the assert can span lines;
	            # a bare trailing comma followed by a newline is a SyntaxError.
	            assert chunk.metadata.get('is_template', False), (
	                "Template content should be marked as is_template=True"
	            )
	            assert (
	                'How to Use This Book' in chunk.content
	                or 'Getting Started' in chunk.content
	            )

	    def test_content_hash_generation(self, chunker):
	        """Test that SHA256 hashes are generated correctly."""
	        content = "Test content for hashing"
	        expected_hash = hashlib.sha256(content.encode('utf-8')).hexdigest()

	        chunk = chunker.chunk_document(content)[0]
	        actual_hash = chunk.metadata['content_hash']

	        assert actual_hash == expected_hash, "Content hash should match SHA256 of content"

	    def test_token_count_accuracy(self, chunker):
	        """Test that token counting is accurate."""
	        # Create content with known token count
	        # NOTE(review): the ~100-token expectation is unverified here; under
	        # cl100k_base this sentence probably encodes to ~6 tokens, i.e. ~300
	        # for 50 repetitions -- confirm against the real tokenizer.
	        content = "This is a test sentence. " * 50  # Approximately 100 tokens

	        chunk = chunker.chunk_document(content)[0]
	        token_count = chunk.metadata['token_count']

	        # Should be approximately 100 tokens (allowing some variance)
	        assert 90 <= token_count <= 110, f"Expected ~100 tokens, got {token_count}"

	    def test_chunk_size_limits(self, chunker):
	        """Test that chunks respect size limits."""
	        large_content = "This is test content. " * 200  # Large content

	        chunks = chunker.chunk_document(large_content)

	        for chunk in chunks:
	            # Check that chunks are not too large; 650 allows some tolerance
	            # above the fixture's target_chunk_size of 600.
	            assert chunk.metadata['token_count'] <= 650, (
	                f"Chunk too large: {chunk.metadata['token_count']} tokens"
	            )

	    def test_overlap_handling(self, chunker):
	        """Test that overlapping chunks share content."""
	        content = """
	Chapter 1
	Section 1.1 with some content here.
	Section 1.2 with different content here.
	Section 1.3 with final content here.
	"""

	        chunks = chunker.chunk_document(content)

	        # The assertions below only run when more than one chunk is produced;
	        # with a single chunk this test passes without checking anything.
	        if len(chunks) > 1:
	            # Check that consecutive chunks have some overlap
	            # This is a simplified check - in practice, you'd check for actual text overlap
	            assert chunks[0].metadata['chunk_index'] == 0
	            assert chunks[1].metadata['chunk_index'] == 1

	    def test_metadata_consistency(self, chunker):
	        """Test that metadata is consistent across chunks."""
	        file_path = "/test/path/document.md"
	        chunks = chunker.chunk_document("Test content", file_path=file_path)

	        for chunk in chunks:
	            assert chunk.metadata['file_path'] == file_path, (
	                "File path should be consistent across chunks"
	            )
	            assert isinstance(chunk.metadata['chunk_index'], int), (
	                "Chunk index should be an integer"
	            )
	            assert isinstance(chunk.metadata['token_count'], int), (
	                "Token count should be an integer"
	            )

	    @patch('backend.rag.chunking.tiktoken.get_encoding')
	    def test_token_counting_with_tiktoken(self, mock_get_encoding, chunker):
	        """Test token counting using tiktoken."""
	        # NOTE(review): pytest builds the `chunker` fixture before this patch
	        # is active; if MarkdownChunker resolves its encoding at construction
	        # time the mock is never consulted -- confirm against the chunker.
	        # Mock tiktoken encoding
	        mock_encoding = Mock()
	        mock_encoding.encode.return_value = [1, 2, 3, 4, 5]  # 5 tokens
	        mock_get_encoding.return_value = mock_encoding

	        content = "Test content"
	        chunk = chunker.chunk_document(content)[0]

	        # Verify tiktoken was used
	        mock_get_encoding.assert_called_with("cl100k_base")
	        assert chunk.metadata['token_count'] == 5

	    def test_empty_content_handling(self, chunker):
	        """Test handling of empty or minimal content."""
	        # Empty string
	        chunks = chunker.chunk_document("")
	        assert len(chunks) == 0, "Empty content should produce no chunks"

	        # Minimal content
	        chunks = chunker.chunk_document("Hi")
	        assert len(chunks) > 0, "Minimal content should still produce a chunk"

	    def test_code_block_preservation(self, chunker):
	        """Test that code blocks are not split."""
	        # The sample embeds a triple-double-quoted docstring, so the outer
	        # literal must use ''' -- otherwise the inner quotes end the string.
	        code_content = '''
	# Code Example

	Here's some introduction text.

	```python
	def long_function_name():
	    """This is a multi-line function.
	    It has many lines of code that shouldn't be split
	    even though it's quite long.
	    """
	    x = 1
	    y = 2
	    return x + y
	```

	Here's some conclusion text.
	'''

	        chunks = chunker.chunk_document(code_content)

	        # Find the chunk containing the code block
	        code_chunks = [c for c in chunks if 'def long_function_name' in c.content]

	        # Nothing is asserted when no chunk contains the function header.
	        if code_chunks:
	            # The code block should be intact in a single chunk
	            code_chunk = code_chunks[0]
	            assert 'def long_function_name' in code_chunk.content
	            assert 'return x + y' in code_chunk.content
	            assert "Here's some conclusion text" not in code_chunk.content