|
|
"""Unit tests for document chunking functionality.""" |
|
|
|
|
|
import pytest |
|
|
import hashlib |
|
|
from unittest.mock import Mock, patch |
|
|
from backend.rag.chunking import MarkdownChunker |
|
|
|
|
|
|
|
|
class TestMarkdownChunker:
    """Test cases for the MarkdownChunker class.

    Every test calls ``MarkdownChunker.chunk_document`` and inspects the
    ``content`` and ``metadata`` attributes of the chunks it returns.
    """

    # Chunker configuration shared by the ``chunker`` fixture and by tests
    # that have to build their own instance.
    TARGET_CHUNK_SIZE = 600
    OVERLAP_SIZE = 100

    # Upper bound asserted by test_chunk_size_limits (50 above the target).
    MAX_CHUNK_TOKENS = 650

    # Metadata keys that test_chunking_basic_functionality requires on
    # every chunk.
    REQUIRED_METADATA_KEYS = (
        'file_path',
        'chunk_index',
        'content_hash',
        'token_count',
    )

    @pytest.fixture
    def chunker(self):
        """Create a chunker instance for testing."""
        return MarkdownChunker(
            target_chunk_size=self.TARGET_CHUNK_SIZE,
            overlap_size=self.OVERLAP_SIZE,
        )

    @pytest.fixture
    def sample_markdown(self):
        """Sample markdown with headings, a fenced code block and a template section."""
        return """
# Chapter 1: Introduction

This is the introduction chapter with some content.

## Section 1.1

Here's some detailed content that should be chunked properly.

```python
def example_function():
    print("This is a code block that shouldn't be split")
    return True
```

More text after the code block.

## Section 1.2

Additional content in this section.

# How to Use This Book

This is a template section that should be filtered out.

## Getting Started

Template content here.
"""

    def test_chunking_basic_functionality(self, chunker, sample_markdown):
        """Chunking the sample yields chunks with content and the required metadata."""
        chunks = chunker.chunk_document(sample_markdown)

        assert len(chunks) > 0, "Should generate at least one chunk"

        for chunk in chunks:
            assert hasattr(chunk, 'content'), "Chunk must have content"
            assert hasattr(chunk, 'metadata'), "Chunk must have metadata"
            for key in self.REQUIRED_METADATA_KEYS:
                assert key in chunk.metadata, f"Chunk must have {key} metadata"

    def test_template_filtering(self, chunker):
        """Chunks produced from template-only content are flagged as templates."""
        template_content = """
# How to Use This Book

This is template content.

## Getting Started

More template content.
"""

        chunks = chunker.chunk_document(template_content)

        for chunk in chunks:
            # The message is parenthesised so the assert can span two lines;
            # a bare newline after the comma is a syntax error.
            assert chunk.metadata.get('is_template', False), (
                "Template content should be marked as is_template=True"
            )
            assert (
                'How to Use This Book' in chunk.content
                or 'Getting Started' in chunk.content
            )

    def test_content_hash_generation(self, chunker):
        """The content_hash metadata equals the SHA256 hex digest of the input."""
        content = "Test content for hashing"
        expected_hash = hashlib.sha256(content.encode('utf-8')).hexdigest()

        chunk = chunker.chunk_document(content)[0]
        actual_hash = chunk.metadata['content_hash']

        assert actual_hash == expected_hash, "Content hash should match SHA256 of content"

    def test_token_count_accuracy(self, chunker):
        """The first chunk's token_count falls inside the expected window."""
        # NOTE(review): this is 50 copies of a five-word sentence, which looks
        # like well over 100 tokens; confirm the 90-110 window below against
        # the tokenizer MarkdownChunker actually uses.
        content = "This is a test sentence. " * 50

        chunk = chunker.chunk_document(content)[0]
        token_count = chunk.metadata['token_count']

        assert 90 <= token_count <= 110, f"Expected ~100 tokens, got {token_count}"

    def test_chunk_size_limits(self, chunker):
        """No chunk of a large document exceeds MAX_CHUNK_TOKENS tokens."""
        large_content = "This is test content. " * 200

        chunks = chunker.chunk_document(large_content)

        for chunk in chunks:
            assert chunk.metadata['token_count'] <= self.MAX_CHUNK_TOKENS, (
                f"Chunk too large: {chunk.metadata['token_count']} tokens"
            )

    def test_overlap_handling(self, chunker):
        """Chunk indices are sequential, starting at 0."""
        content = """
Chapter 1
Section 1.1 with some content here.
Section 1.2 with different content here.
Section 1.3 with final content here.
"""

        chunks = chunker.chunk_document(content)

        # Check every chunk rather than only the first two behind a
        # ``len(chunks) > 1`` guard, so the test still asserts something when
        # the document fits into a single chunk.
        # NOTE(review): the overlapping text itself is not verified here; how
        # overlap is represented in a chunk is not visible from this file.
        for expected_index, chunk in enumerate(chunks):
            assert chunk.metadata['chunk_index'] == expected_index

    def test_metadata_consistency(self, chunker):
        """Every chunk carries the given file_path and integer index/token count."""
        file_path = "/test/path/document.md"
        chunks = chunker.chunk_document("Test content", file_path=file_path)

        for chunk in chunks:
            assert chunk.metadata['file_path'] == file_path, (
                "File path should be consistent across chunks"
            )
            assert isinstance(chunk.metadata['chunk_index'], int), (
                "Chunk index should be an integer"
            )
            assert isinstance(chunk.metadata['token_count'], int), (
                "Token count should be an integer"
            )

    @patch('backend.rag.chunking.tiktoken.get_encoding')
    def test_token_counting_with_tiktoken(self, mock_get_encoding):
        """token_count reflects the length of the mocked tiktoken encoding."""
        mock_encoding = Mock()
        mock_encoding.encode.return_value = [1, 2, 3, 4, 5]
        mock_get_encoding.return_value = mock_encoding

        # Build the chunker here rather than taking the ``chunker`` fixture:
        # pytest creates fixtures before the @patch decorator becomes active,
        # so an instance that looks up its encoding at construction time
        # would never see the mock.
        chunker = MarkdownChunker(
            target_chunk_size=self.TARGET_CHUNK_SIZE,
            overlap_size=self.OVERLAP_SIZE,
        )

        content = "Test content"
        chunk = chunker.chunk_document(content)[0]

        mock_get_encoding.assert_called_with("cl100k_base")
        assert chunk.metadata['token_count'] == 5

    def test_empty_content_handling(self, chunker):
        """Empty input yields no chunks; minimal input still yields one."""
        chunks = chunker.chunk_document("")
        assert len(chunks) == 0, "Empty content should produce no chunks"

        chunks = chunker.chunk_document("Hi")
        assert len(chunks) > 0, "Minimal content should still produce a chunk"

    def test_code_block_preservation(self, chunker):
        """A fenced code block stays together in a single chunk."""
        # The outer literal uses ''' because the sample code contains a
        # \"\"\" docstring, which would otherwise terminate the string early.
        code_content = '''
# Code Example

Here's some introduction text.

```python
def long_function_name():
    """This is a multi-line function.
    It has many lines of code that shouldn't be split
    even though it's quite long.
    """
    x = 1
    y = 2
    return x + y
```

Here's some conclusion text.
'''

        chunks = chunker.chunk_document(code_content)

        code_chunks = [c for c in chunks if 'def long_function_name' in c.content]

        # Fail loudly instead of silently skipping the checks when the code
        # block is missing from the output.
        assert code_chunks, "Code block should appear in at least one chunk"

        code_chunk = code_chunks[0]
        assert 'return x + y' in code_chunk.content
        assert "Here's some conclusion text" not in code_chunk.content