Skip to content

Utilities

This section covers utility functions and helper modules used throughout the pipeline.

Common Utilities

General-purpose utility functions.

read_in_chunks(file_path, mode, chunk_size=4096) async

Read a file asynchronously in chunks of at most chunk_size (bytes when the file is opened in a binary mode).

Source code in eve/utils.py
25
26
27
28
29
30
31
async def read_in_chunks(file_path: Path, mode: str, chunk_size: int = 4096) -> AsyncGenerator[bytes, None]:
    """
    Asynchronously yield the contents of a file piece by piece.

    Args:
        file_path: Location of the file to open.
        mode: Mode string handed to ``aiofiles.open``.
        chunk_size: Size argument passed to every ``read`` call.

    Yields:
        Each non-empty result of ``read(chunk_size)``, in file order.
    """
    async with aiofiles.open(file_path, mode) as handle:
        while True:
            piece = await handle.read(chunk_size)
            if not piece:
                # An empty (falsy) read ends the iteration.
                break
            yield piece

HTTP Utils

HTTP client utilities for server-based processing.

Common HTTP utilities for making API calls across the pipeline.

make_openrouter_request(api_key, model, prompt, max_tokens=1000, temperature=0.1) async

Make a request to OpenRouter API for LLM completion.

Parameters:

Name Type Description Default
api_key str

OpenRouter API key

required
model str

Model name to use

required
prompt str

The prompt to send

required
max_tokens int

Maximum tokens in response

1000
temperature float

Temperature for response generation

0.1

Returns:

Type Description
Optional[str]

Response content or None if request failed

Source code in eve/common/http_utils.py
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
async def make_openrouter_request(
    api_key: str,
    model: str,
    prompt: str,
    max_tokens: int = 1000,
    temperature: float = 0.1
) -> Optional[str]:
    """
    Make a request to OpenRouter API for LLM completion.

    The prompt is sent as a single user message. The text of the first
    returned choice is stripped of surrounding whitespace, of a leading
    "```latex" fence and of a trailing "```" fence before being returned.

    Args:
        api_key: OpenRouter API key
        model: Model name to use
        prompt: The prompt to send
        max_tokens: Maximum tokens in response
        temperature: Temperature for response generation

    Returns:
        Response content, or None if the request failed or the response
        carried no text completion (no "choices" key, empty choices list,
        missing message/content keys, or non-string content).
    """
    import re

    url = "https://openrouter.ai/api/v1/chat/completions"
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json"
    }
    data = {
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": max_tokens,
        "temperature": temperature
    }

    response = await post_request(url, headers, data)
    if not response or "choices" not in response:
        return None

    # A successful HTTP response can still carry an empty ``choices`` list,
    # a malformed choice, or a null ``content``; honour the "None on
    # failure" contract instead of raising on those shapes.
    try:
        content = response["choices"][0]["message"]["content"]
    except (IndexError, KeyError, TypeError):
        return None
    if not isinstance(content, str):
        return None

    content = content.strip()
    # Drop a Markdown code fence wrapped around the answer, if present.
    content = re.sub(r'^```latex\n?', '', content)
    content = re.sub(r'\n?```$', '', content)
    return content.strip()

post_request(url, headers, data, timeout=30) async

Make an async POST request and return JSON response.

Parameters:

Name Type Description Default
url str

The URL to make the request to

required
headers Dict[str, str]

Request headers

required
data Dict[str, Any]

Request data to send as JSON

required
timeout int

Request timeout in seconds

30

Returns:

Type Description
Optional[Dict[str, Any]]

Response JSON data or None if request failed

Source code in eve/common/http_utils.py
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
async def post_request(
    url: str,
    headers: Dict[str, str],
    data: Dict[str, Any],
    timeout: int = 30
) -> Optional[Dict[str, Any]]:
    """
    POST a JSON payload asynchronously and hand back the decoded JSON reply.

    Args:
        url: Endpoint that receives the request
        headers: Headers sent with the request
        data: Payload serialised as the JSON request body
        timeout: Total time budget for the request, in seconds

    Returns:
        The parsed JSON body when the server answers with status 200;
        None for any other status or when any exception is raised
        (both cases are logged as errors).
    """
    try:
        request_timeout = aiohttp.ClientTimeout(total=timeout)
        async with aiohttp.ClientSession() as session, session.post(
            url,
            headers=headers,
            json=data,
            timeout=request_timeout,
        ) as response:
            if response.status != 200:
                logger.error(f"HTTP request failed with status {response.status}")
                return None
            return await response.json()
    except Exception as e:
        # Best-effort helper: every failure is reported through the log
        # and surfaced to the caller as None.
        logger.error(f"HTTP request failed: {str(e)}")
        return None

Regex Patterns

Common regular expression patterns used throughout the pipeline.

Common regex patterns used across the pipeline.

clean_doubled_backslashes(text)

Clean up doubled backslashes in LaTeX content.

Source code in eve/common/regex_patterns.py
76
77
78
def clean_doubled_backslashes(text: str) -> str:
    """
    Collapse doubled backslashes in LaTeX content.

    Every match of ``DOUBLED_BACKSLASH_PATTERN`` is replaced by a run of
    backslashes half as long as the match (integer division).
    """
    def _halve(match):
        matched_length = len(match.group())
        return '\\' * (matched_length // 2)

    return DOUBLED_BACKSLASH_PATTERN.sub(_halve, text)

extract_html_meta_tags(html_content)

Extract metadata from HTML meta tags.

Parameters:

Name Type Description Default
html_content str

HTML content as string

required

Returns:

Type Description
dict[str, str]

Dictionary containing extracted meta tag information

Source code in eve/common/regex_patterns.py
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
# Start of a <meta ...> tag plus its attribute text. Quoted values are
# consumed as a unit so that a ">" inside a value does not end the tag.
_META_TAG_PATTERN = re.compile(
    r'<meta\b((?:"[^"]*"|\'[^\']*\'|[^>"\'])*)',
    re.IGNORECASE,
)

# One quoted attribute: name, then either a double- or single-quoted value.
# The closing quote must be the same kind as the opening one.
_META_ATTR_PATTERN = re.compile(
    r'([^\s"\'<>/=]+)\s*=\s*(?:"([^"]*)"|\'([^\']*)\')'
)

# (identifying attribute, lower-cased identifier) -> key in the result.
# The order of this mapping is the order of keys in the returned dict.
_META_TAG_KEYS = {
    ('name', 'description'): 'description',
    ('name', 'keywords'): 'keywords',
    ('name', 'author'): 'author',
    ('property', 'og:title'): 'og_title',
    ('property', 'og:description'): 'og_description',
    ('name', 'twitter:title'): 'twitter_title',
}


def extract_html_meta_tags(html_content: str) -> dict[str, str]:
    """
    Extract metadata from HTML meta tags.

    Recognised tags are ``description``, ``keywords``, ``author`` and
    ``twitter:title`` (identified by ``name``) and ``og:title`` /
    ``og:description`` (identified by ``property``). Matching of tag,
    attribute names and identifiers is case-insensitive, attributes may
    appear in any order, and a value may contain the other kind of quote
    (``content="O'Brien"``). Only quoted attribute values are recognised.

    Args:
        html_content: HTML content as string

    Returns:
        Dictionary containing extracted meta tag information. Values are
        whitespace-stripped; tags with empty content are ignored and the
        first non-empty value found for a key wins. Possible keys are
        ``description``, ``keywords``, ``author``, ``og_title``,
        ``og_description`` and ``twitter_title``.
    """
    if not html_content:
        return {}

    found: dict[str, str] = {}
    for tag in _META_TAG_PATTERN.finditer(html_content):
        attrs: dict[str, str] = {}
        for attr in _META_ATTR_PATTERN.finditer(tag.group(1)):
            double_quoted, single_quoted = attr.group(2), attr.group(3)
            value = double_quoted if double_quoted is not None else single_quoted
            # If an attribute is repeated, the first occurrence is kept.
            attrs.setdefault(attr.group(1).lower(), value)

        content = attrs.get('content', '').strip()
        if not content:
            continue

        for attr_name in ('name', 'property'):
            identifier = attrs.get(attr_name)
            if identifier is None:
                continue
            key = _META_TAG_KEYS.get((attr_name, identifier.lower()))
            if key is not None:
                found.setdefault(key, content)

    # Emit keys in a fixed order that does not depend on document order.
    return {key: found[key] for key in _META_TAG_KEYS.values() if key in found}

extract_html_title(html_content)

Extract title from HTML content.

Parameters:

Name Type Description Default
html_content str

HTML content as string

required

Returns:

Type Description
str

Extracted and cleaned title, or None if not found

Source code in eve/common/regex_patterns.py
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
def extract_html_title(html_content: str) -> str:
    """
    Pull the page title out of an HTML document.

    The first capture group of ``HTML_TITLE_PATTERN`` is taken as the raw
    title. Matches of ``HTML_TAG_PATTERN`` are removed from it, matches of
    ``HTML_ENTITY_PATTERN`` and ``HTML_NUMERIC_ENTITY_PATTERN`` are each
    replaced by a single space, and the result is whitespace-stripped.

    Args:
        html_content: HTML content as string

    Returns:
        The cleaned title, or None when the input is empty or no title
        match is found
    """
    if not html_content:
        return None

    found = HTML_TITLE_PATTERN.search(html_content)
    if not found:
        return None

    cleaned = found.group(1)
    substitutions = (
        (HTML_TAG_PATTERN, ''),
        (HTML_ENTITY_PATTERN, ' '),
        (HTML_NUMERIC_ENTITY_PATTERN, ' '),
    )
    # Order matters: later patterns see the output of earlier ones.
    for pattern, replacement in substitutions:
        cleaned = pattern.sub(replacement, cleaned)

    return cleaned.strip()

extract_json_ld_count(html_content)

Count JSON-LD structured data blocks in HTML.

Parameters:

Name Type Description Default
html_content str

HTML content as string

required

Returns:

Type Description
int

Number of JSON-LD script blocks found

Source code in eve/common/regex_patterns.py
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
def extract_json_ld_count(html_content: str) -> int:
    """
    Report how many JSON-LD script blocks an HTML document contains.

    Args:
        html_content: HTML content as string

    Returns:
        The number of ``JSON_LD_SCRIPT_PATTERN`` matches, or 0 for empty
        input
    """
    return len(JSON_LD_SCRIPT_PATTERN.findall(html_content)) if html_content else 0

fix_ocr_digit_letter_spacing(text)

Fix OCR issues where digits are concatenated with letters.

Source code in eve/common/regex_patterns.py
 99
100
101
def fix_ocr_digit_letter_spacing(text: str) -> str:
    """
    Separate digits that OCR glued onto letters.

    For every match of ``DIGIT_LETTER_PATTERN`` a single space is placed
    between the match's first and second capture groups.
    """
    spaced_groups = r'\1 \2'
    return DIGIT_LETTER_PATTERN.sub(spaced_groups, text)

get_latex_formula_patterns()

Get all LaTeX formula patterns in a dictionary.

Returns:

Type Description
dict[str, Pattern[str]]

Dictionary mapping pattern names to compiled regex patterns

Source code in eve/common/regex_patterns.py
60
61
62
63
64
65
66
67
68
69
70
71
72
73
def get_latex_formula_patterns() -> dict[str, Pattern[str]]:
    """
    Collect the module's LaTeX formula patterns under short names.

    Returns:
        A new dictionary with the keys ``inline``, ``display``,
        ``bracket``, ``square_bracket`` and ``environment``, each mapped
        to the corresponding module-level pattern
    """
    return dict(
        inline=INLINE_MATH_PATTERN,
        display=DISPLAY_MATH_PATTERN,
        bracket=BRACKET_MATH_PATTERN,
        square_bracket=SQUARE_BRACKET_MATH_PATTERN,
        environment=LATEX_ENV_PATTERN,
    )

normalize_excessive_newlines(text)

Replace 3+ consecutive newlines with exactly 2.

Source code in eve/common/regex_patterns.py
81
82
83
def normalize_excessive_newlines(text: str) -> str:
    """
    Squeeze long runs of newlines down to a single blank line.

    Each match of ``EXCESSIVE_NEWLINES_PATTERN`` is replaced with exactly
    two newline characters.
    """
    paragraph_break = '\n\n'
    return EXCESSIVE_NEWLINES_PATTERN.sub(paragraph_break, text)

remove_nougat_artifacts(text)

Remove Nougat-specific warning and error artifacts.

Source code in eve/common/regex_patterns.py
104
105
106
107
108
109
def remove_nougat_artifacts(text: str) -> str:
    """
    Strip Nougat warning and error artifacts from text.

    Matches of ``WARNING_PATTERN`` and then ``ERROR_PATTERN`` are deleted,
    followed by every literal ``[MISSING_PAGE_POST]`` marker.
    """
    for artifact_pattern in (WARNING_PATTERN, ERROR_PATTERN):
        text = artifact_pattern.sub('', text)
    return text.replace('[MISSING_PAGE_POST]', '')

remove_single_symbol_lines(text)

Remove lines that contain only a single symbol or punctuation.

Source code in eve/common/regex_patterns.py
86
87
88
89
90
91
92
93
94
95
96
def remove_single_symbol_lines(text: str) -> str:
    """
    Drop lines whose only content is one non-word character.

    A line is removed when, after stripping surrounding whitespace, it is
    exactly one character long and that character does not match ``\\w``.
    All other lines, including empty ones, are kept unchanged.
    """
    def _is_lone_symbol(line: str) -> bool:
        core = line.strip()
        return len(core) == 1 and re.search(r'\w', core) is None

    return '\n'.join(
        line for line in text.split('\n') if not _is_lone_symbol(line)
    )

Prompts

Prompt templates used in LLM-based processing.

Common prompts used across the pipeline.

get_latex_correction_prompt(formula_type, error_message, formula, context)

Generate a LaTeX correction prompt.

Parameters:

Name Type Description Default
formula_type str

Type of LaTeX formula (inline, display, etc.)

required
error_message str

The error message from LaTeX compilation

required
formula str

The problematic formula

required
context str

Surrounding context for better understanding

required

Returns:

Type Description
str

Formatted prompt string

Source code in eve/common/prompts.py
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
def get_latex_correction_prompt(
    formula_type: str,
    error_message: str,
    formula: str,
    context: str
) -> str:
    """
    Build the prompt that asks for a LaTeX formula to be corrected.

    Args:
        formula_type: Type of LaTeX formula (inline, display, etc.)
        error_message: The error message from LaTeX compilation
        formula: The problematic formula
        context: Surrounding context; when longer than 1000 characters
            only the first 1000 are used, followed by "..."

    Returns:
        ``LATEX_CORRECTION_PROMPT`` with its placeholders filled in
    """
    if len(context) > 1000:
        context_snippet = context[:1000] + "..."
    else:
        context_snippet = context

    template_fields = {
        "formula_type": formula_type,
        "error_message": error_message,
        "formula": formula,
        "context_snippet": context_snippet,
    }
    return LATEX_CORRECTION_PROMPT.format(**template_fields)

Logging

Logging configuration and utilities.