Skip to content

Data Models

This section documents the data models and structures used throughout the EVE Pipeline.

Document Model

The primary data structure representing documents in the pipeline.

Unified document object that encapsulates content and metadata throughout the pipeline.

This replaces the need to pass (Path, str) tuples and provides a consistent interface for document handling across all pipeline stages.

Attributes:

Name Type Description
content str

The actual document text content

file_path Path

Path to the source file

file_format str

Format of the source file (pdf, md, html, etc.)

metadata Dict[str, Any]

Original metadata from the document (preserved from source)

embedding Optional[List[float]]

Optional embedding vector for the document

pipeline_metadata Dict[str, Any]

Metadata added by pipeline steps (filters, processing, etc.)

Source code in eve/model/document.py
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
@dataclass
class Document:
    """
    Unified document object that encapsulates content and metadata throughout the pipeline.

    This replaces the need to pass (Path, str) tuples and provides a consistent
    interface for document handling across all pipeline stages.

    Equality and hashing are based on ``file_path`` only: two documents that
    point at the same path compare equal even if their content differs.

    Attributes:
        content: The actual document text content
        file_path: Path to the source file
        file_format: Format of the source file (pdf, md, html, etc.)
        metadata: Original metadata from the document (preserved from source)
        embedding: Optional embedding vector for the document
        pipeline_metadata: Metadata added by pipeline steps (filters, processing, etc.)
    """

    content: str
    file_path: Path
    file_format: str
    metadata: Dict[str, Any] = field(default_factory=dict)
    embedding: Optional[List[float]] = None
    pipeline_metadata: Dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> Dict[str, Any]:
        """
        Return a plain-dict view of the document.

        ``file_path`` is converted to ``str`` and both metadata dicts are
        shallow-copied. The ``embedding`` key is only present when an
        embedding has been set.
        """
        result = {
            "content": self.content,
            "file_path": str(self.file_path),
            "file_format": self.file_format,
            "metadata": self.metadata.copy(),
            "pipeline_metadata": self.pipeline_metadata.copy(),
        }
        if self.embedding is not None:
            result["embedding"] = self.embedding
        return result

    def __dict__(self) -> Dict[str, Any]:
        """Backward-compatible alias for :meth:`to_dict`."""
        # NOTE: defining ``__dict__`` as a method shadows the usual instance
        # attribute mapping, so ``doc.__dict__`` is a bound method and
        # ``vars(doc)`` does not return the attribute dict. It is kept only so
        # existing ``doc.__dict__()`` callers keep working; prefer ``to_dict()``.
        return self.to_dict()

    def __hash__(self):
        """Hash on ``file_path`` only, consistent with ``__eq__``."""
        return hash(self.file_path)

    def __eq__(self, other):
        """Two documents are equal when they share the same ``file_path``."""
        return isinstance(other, Document) and self.file_path == other.file_path

    @property
    def filename(self) -> str:
        """Get the filename without path."""
        return self.file_path.name

    @property
    def extension(self) -> str:
        """Get the file extension (without the leading dot)."""
        return self.file_path.suffix.lstrip(".")

    @property
    def content_length(self) -> int:
        """Get the length of the content."""
        return len(self.content)

    def is_empty(self) -> bool:
        """Check if the document content is empty or whitespace-only."""
        return not self.content.strip()

    def add_metadata(self, key: str, value: Any) -> None:
        """Add an entry to the original metadata."""
        self.metadata[key] = value

    def get_metadata(self, key: str, default: Any = None) -> Any:
        """Get a value from the original metadata with optional default."""
        return self.metadata.get(key, default)

    def add_pipeline_metadata(self, key: str, value: Any) -> None:
        """Add an entry to pipeline metadata (for tracking pipeline processing)."""
        self.pipeline_metadata[key] = value

    def get_pipeline_metadata(self, key: str, default: Any = None) -> Any:
        """Get a value from pipeline metadata with optional default."""
        return self.pipeline_metadata.get(key, default)

    def update_content(self, new_content: str) -> None:
        """
        Update the document content and track the change in metadata.

        A record with the old length, new length and size delta is appended
        to the ``"content_changes"`` list in ``metadata``.
        """
        old_length = self.content_length
        self.content = new_content
        new_length = self.content_length

        # Track content changes in metadata
        changes = self.metadata.get("content_changes", [])
        changes.append(
            {
                "old_length": old_length,
                "new_length": new_length,
                "size_change": new_length - old_length,
            }
        )
        self.metadata["content_changes"] = changes

    @classmethod
    def from_path_and_content(
        cls, file_path: Path, content: str, **metadata
    ) -> "Document":
        """
        Create a Document from a file path and content string.

        ``file_format`` is a required field, so it is derived here from the
        path's suffix (lower-cased, without the leading dot; empty string when
        the path has no suffix). Extra keyword arguments become the
        document's ``metadata``.
        """
        file_format = Path(file_path).suffix.lstrip(".").lower()
        return cls(
            content=content,
            file_path=file_path,
            file_format=file_format,
            metadata=metadata,
        )

    @classmethod
    def from_tuple(cls, path_content_tuple: tuple[Path, str], **metadata) -> "Document":
        """Create a Document from a (Path, str) tuple for backwards compatibility."""
        file_path, content = path_content_tuple
        return cls.from_path_and_content(file_path, content, **metadata)

    def to_tuple(self) -> tuple[Path, str]:
        """Convert to (Path, str) tuple for backwards compatibility."""
        return (self.file_path, self.content)

    def __str__(self) -> str:
        """String representation showing filename and file format."""
        return f"Document({self.filename}, {self.file_format} format)"

    def __repr__(self) -> str:
        """Detailed representation showing path, format and metadata keys."""
        return f"Document(file_path={self.file_path}, format={self.file_format}, metadata_keys={list(self.metadata.keys())})"

content_length property

Get the length of the content.

extension property

Get the file extension.

filename property

Get the filename without path.

__repr__()

Detailed representation.

Source code in eve/model/document.py
123
124
125
def __repr__(self) -> str:
    """Return a debugging summary: source path, format and metadata key names."""
    metadata_keys = list(self.metadata.keys())
    return (
        f"Document(file_path={self.file_path}, format={self.file_format}, "
        f"metadata_keys={metadata_keys})"
    )

__str__()

String representation showing filename and file format.

Source code in eve/model/document.py
119
120
121
def __str__(self) -> str:
    """Return a short label built from the filename and the file format."""
    name, fmt = self.filename, self.file_format
    return f"Document({name}, {fmt} format)"

add_metadata(key, value)

Add an entry to the original metadata.

Source code in eve/model/document.py
69
70
71
def add_metadata(self, key: str, value: Any) -> None:
    """Store ``value`` under ``key`` in the original metadata, replacing any existing entry."""
    self.metadata.update({key: value})

add_pipeline_metadata(key, value)

Add an entry to pipeline metadata (for tracking pipeline processing).

Source code in eve/model/document.py
77
78
79
def add_pipeline_metadata(self, key: str, value: Any) -> None:
    """Record ``value`` under ``key`` in the pipeline metadata, replacing any existing entry."""
    self.pipeline_metadata.update({key: value})

from_path_and_content(file_path, content, **metadata) classmethod

Create a Document from a file path and content string.

Source code in eve/model/document.py
102
103
104
105
106
107
@classmethod
def from_path_and_content(
    cls, file_path: Path, content: str, **metadata
) -> "Document":
    """
    Create a Document from a file path and content string.

    ``file_format`` is passed explicitly because the constructor call
    previously omitted it. It is derived from the path's suffix (lower-cased,
    without the leading dot; empty string when the path has no suffix).
    Extra keyword arguments become the document's ``metadata``.
    """
    file_format = Path(file_path).suffix.lstrip(".").lower()
    return cls(
        content=content,
        file_path=file_path,
        file_format=file_format,
        metadata=metadata,
    )

from_tuple(path_content_tuple, **metadata) classmethod

Create a Document from a (Path, str) tuple for backwards compatibility.

Source code in eve/model/document.py
109
110
111
112
113
@classmethod
def from_tuple(cls, path_content_tuple: tuple[Path, str], **metadata) -> "Document":
    """Build a Document from a legacy ``(path, content)`` pair.

    The pair is unpacked and forwarded, together with any extra keyword
    arguments, to ``from_path_and_content``.
    """
    source_path, text = path_content_tuple
    return cls.from_path_and_content(source_path, text, **metadata)

get_metadata(key, default=None)

Get a value from the original metadata with optional default.

Source code in eve/model/document.py
73
74
75
def get_metadata(self, key: str, default: Any = None) -> Any:
    """Look up ``key`` in the original metadata, falling back to ``default`` when absent."""
    source_metadata = self.metadata
    if key in source_metadata:
        return source_metadata[key]
    return default

get_pipeline_metadata(key, default=None)

Get a value from pipeline metadata with optional default.

Source code in eve/model/document.py
81
82
83
def get_pipeline_metadata(self, key: str, default: Any = None) -> Any:
    """Look up ``key`` in the pipeline metadata, falling back to ``default`` when absent."""
    stage_metadata = self.pipeline_metadata
    if key in stage_metadata:
        return stage_metadata[key]
    return default

is_empty()

Check if the document content is empty.

Source code in eve/model/document.py
65
66
67
def is_empty(self) -> bool:
    """Report whether the content has no characters left after stripping whitespace."""
    stripped = self.content.strip()
    return len(stripped) == 0

to_tuple()

Convert to (Path, str) tuple for backwards compatibility.

Source code in eve/model/document.py
115
116
117
def to_tuple(self) -> tuple[Path, str]:
    """Return the legacy ``(file_path, content)`` pair for this document."""
    source_path, text = self.file_path, self.content
    return source_path, text

update_content(new_content)

Update the document content and track the change in metadata.

Source code in eve/model/document.py
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
def update_content(self, new_content: str) -> None:
    """Replace the document content and log the size change.

    A record holding the previous length, the new length and their
    difference is appended to the ``"content_changes"`` list stored in
    ``metadata`` (the list is created on first use).
    """
    length_before = self.content_length
    self.content = new_content
    length_after = self.content_length

    change_record = {
        "old_length": length_before,
        "new_length": length_after,
        "size_change": length_after - length_before,
    }
    # setdefault creates the history list on first use and returns the stored one after.
    self.metadata.setdefault("content_changes", []).append(change_record)

Configuration Models

Data models for pipeline configuration.

Inputs Configuration

Bases: BaseModel

Source code in eve/config.py
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
class Inputs(BaseModel):
    # Pipeline input specification: one path or a list of paths, each of
    # which may point at a file or a directory.
    mode: str = "file"  # file | directory
    path: Union[str, list[str]]

    def get_files(self) -> list[Path]:
        """Resolve the configured path(s) into a flat list of files.

        A path that is a regular file is returned as-is; a directory
        contributes every file found beneath it at any depth. Paths that are
        neither a file nor a directory contribute nothing.
        """
        raw_paths = self.path if not isinstance(self.path, str) else [self.path]
        collected = []

        for raw in raw_paths:
            candidate = Path(raw)

            if candidate.is_file():
                collected.append(candidate)
                continue

            if candidate.is_dir():
                # rglob walks the whole subtree, not just the top level
                collected.extend(
                    entry for entry in candidate.rglob("*") if entry.is_file()
                )
        return collected

Pipeline Configuration

Bases: BaseModel

Source code in eve/config.py
25
26
27
28
29
30
31
32
33
34
35
36
class PipelineConfig(BaseModel):
    # Top-level pipeline configuration: batch size, input sources and the
    # ordered list of stages to run.
    batch_size: int = 20
    inputs: Inputs
    stages: list[dict[str, Any]]  # list of dict since we have stage name + stage configs

    @validator("stages")
    def check_stages(cls, v):
        """Validate that every stage has a supported ``name``.

        Raises:
            ValueError: if a stage has no ``"name"`` key, or its name is not
                one of the allowed stage names.
        """
        allowed = {
            "ingestion",
            "cleaning",
            "export",
            "duplication",
            "extraction",
            "pii",
            "metadata",
            "chunker",
            "export_jsonl",
            "perplexity",
            "pii_filter",
            "length_filter",
            "newline_filter",
            "reference_filter",
            "qdrant_upload",
        }
        for stage in v:
            # A missing key would otherwise surface as a raw KeyError instead
            # of a validation error, so report it as a ValueError.
            if "name" not in stage:
                raise ValueError(f"Stage is missing required 'name' key: {stage}")
            if stage["name"] not in allowed:
                raise ValueError(f"Unsupported stage: {stage['name']}. Allowed: {allowed}")
        return v