diff --git a/Dockerfile b/Dockerfile
index 60e0cba4bf..3f9343429f 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -12,6 +12,7 @@ RUN apt-get update && \
apt-get install -y --no-install-recommends $(grep -vE "^\s*#" apt.txt | tr "\n" " ") && \
apt-get install libpq-dev postgresql-client -y --no-install-recommends && \
apt-get install poppler-utils -y && \
+ apt-get install default-jre -y && \
apt-get clean && \
apt-get purge && \
rm -rf /var/lib/apt/lists/*
diff --git a/learning_resources/converters/__init__.py b/learning_resources/converters/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/learning_resources/converters/opendataloader_llm_converter.py b/learning_resources/converters/opendataloader_llm_converter.py
new file mode 100644
index 0000000000..a6df9f5060
--- /dev/null
+++ b/learning_resources/converters/opendataloader_llm_converter.py
@@ -0,0 +1,766 @@
+"""
+PDF to Markdown converter using OpenDataLoader JSON output and LLM-based OCR.
+
+Strategy:
+1. Parse PDF structure.
+2. Calculate a "Math Density Score" for each page.
+3. Decision Logic:
+ - High Score (> Threshold): OCR the entire page (preserves complex layout/formulas).
+ - Low Score: Use standard parsed text and only OCR specific embedded images.
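+
+Example usage (a sketch; assumes Django settings such as OCR_MODEL and
+OCR_MATH_DENSITY_THRESHOLD are configured):
+
+    converter = OpenDataLoaderLLMConverter(Path("lecture_notes.pdf"))
+    markdown = converter.convert_to_markdown()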
+"""
+
+import base64
+import gc
+import json
+import logging
+import re
+import tempfile
+import uuid
+from collections import defaultdict
+from dataclasses import dataclass, field
+from io import BytesIO
+from pathlib import Path
+from typing import Any
+
+import cv2
+import litellm
+import numpy as np
+import opendataloader_pdf
+import pdf2image
+from django.conf import settings
+from litellm import batch_completion
+from PIL import Image
+
+log = logging.getLogger(__name__)
+
+# --- Configuration ---
+MIN_IMAGE_DIMENSION = 32
+MIN_IMAGE_RATIO = 12
+IMAGE_BATCH_SIZE = 10
+PDF_POINTS_PER_INCH = 72
+
+# A score above settings.OCR_MATH_DENSITY_THRESHOLD (default 5) triggers full-page OCR.
+
+
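+# TeX/AMS math font family prefixes (e.g. "cmmi" is Computer Modern math italic);
+# matched as substrings of normalized font names in _calculate_page_math_score.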
+MATH_FONTS = {"cmmi", "cmsy", "cmex", "msbm", "msam", "eufm", "dsrom", "wasy", "stmary"}
+
+
+BASIC_MATH_SYMBOLS = re.compile(r"[+=<>≤≥≠±\-]")
+# Regex for "Complex" math (Integrals, Sums, Keywords)
+COMPLEX_MATH_REGEX = re.compile(
+ r"([∑∏∫∂∇√∞∀∃∈⊂∪→⇒⇔αβγδεθλμπστφωΩΓΛΨ]|\b(lim|sin|cos|tan|log|ln|det|mod)\b)" # noqa: RUF001
+)
+
+
+# --- Content Block Types (from JsonName.java) ---
+class BlockType:
+ IMAGE = "image"
+ LIST_ITEM = "list item"
+ LINE = "line"
+ TABLE = "table"
+ TEXT_BLOCK = "text block"
+ LIST = "list"
+ TABLE_CELL = "table cell"
+ TABLE_ROW = "table row"
+ PARAGRAPH = "paragraph"
+ HEADING = "heading"
+ TEXT_CHUNK = "text chunk"
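+    # Synthetic type used internally for whole-page OCR results (not part of
+    # the OpenDataLoader JSON schema).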
+ FULL_PAGE_OCR = "full_page_ocr"
+
+
+@dataclass
+class TableCell:
+ """Represents a cell in a table."""
+
+ content: str | None = None
+ column_number: int = 0
+ row_number: int = 0
+ column_span: int = 1
+ row_span: int = 1
+ kids: list[Any] = field(default_factory=list)
+
+
+@dataclass
+class TableRow:
+ """Represents a row in a table."""
+
+ row_number: int = 0
+ cells: list[TableCell] = field(default_factory=list)
+
+
+@dataclass
+class ListItem:
+ """Represents an item in a list."""
+
+ content: str | None = None
+ kids: list[Any] = field(default_factory=list)
+
+
+@dataclass
+class ContentBlock:
+ block_type: str
+ block_id: int
+ page_number: int
+ bounding_box: list[float]
+ content: str | None = None
+ heading_level: int | None = None
+ font: str | None = None
+ # Table-specific fields
+ rows: list[TableRow] = field(default_factory=list)
+ number_of_rows: int = 0
+ number_of_columns: int = 0
+ # List-specific fields
+ list_items: list[ListItem] = field(default_factory=list)
+ numbering_style: str | None = None
+ # Nested content (for complex structures)
+ kids: list[Any] = field(default_factory=list)
+
+
+@dataclass
+class ImageForOCR:
+ block_id: int
+ pil_image: Image.Image
+ is_full_page: bool
+
+
+def _image_to_base64_uri(pil_image: Image.Image) -> str:
+ buffer = BytesIO()
+ pil_image.save(buffer, format="JPEG", optimize=True)
+ image_b64 = base64.b64encode(buffer.getvalue()).decode("utf-8")
+ return f"data:image/jpeg;base64,{image_b64}"
+
+
+def _build_ocr_message(image_uri: str, prompt: str) -> list[dict]:
+ return [
+ {
+ "role": "user",
+ "content": [
+ {"type": "text", "text": prompt},
+ {"type": "image_url", "image_url": {"url": image_uri}},
+ ],
+ }
+ ]
+
+
+def _is_valid_image_dimensions(width: int, height: int) -> bool:
+ """Check if image dimensions are valid for processing."""
+ if width < MIN_IMAGE_DIMENSION or height < MIN_IMAGE_DIMENSION:
+ return False
+
+ # Avoid extremely thin strips (often separator lines)
+ if height > 0:
+ aspect_ratio = width / height
+ if aspect_ratio > MIN_IMAGE_RATIO or aspect_ratio < (1 / MIN_IMAGE_RATIO):
+ return False
+
+ return True
+
+
+def _optimize_image(pil_image: Image.Image) -> Image.Image:
+ if pil_image.mode != "RGB":
+ pil_image = pil_image.convert("RGB")
+ np_image = np.array(pil_image)
+ UNIQUE_COLOR_THRESHOLD = 200
+ unique_colors = len(np.unique(np_image.reshape(-1, 3), axis=0))
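+    # Heuristic: many unique colors suggests photo-like content, which is kept
+    # as smoothed grayscale; few colors suggests text/line art, where Otsu
+    # binarization yields a crisper input for OCR.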
+ if unique_colors > UNIQUE_COLOR_THRESHOLD:
+ processed = cv2.cvtColor(np_image, cv2.COLOR_RGB2GRAY)
+ processed = cv2.GaussianBlur(processed, (3, 3), 0)
+ return Image.fromarray(processed)
+ else:
+ gray = cv2.cvtColor(np_image, cv2.COLOR_RGB2GRAY)
+ _, processed = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
+ return Image.fromarray(processed)
+
+
+class PDFPageRenderer:
+ def __init__(self, document_path: Path, dpi: int = 150):
+ self.document_path = document_path
+ self.dpi = dpi
+ self._page_cache: dict[int, Image.Image] = {}
+ self._scale = dpi / PDF_POINTS_PER_INCH
+
+ def get_page_image(self, page_number: int) -> Image.Image:
+ """
+ Get a specific page from the pdf as an image
+ """
+ if page_number not in self._page_cache:
+ images = pdf2image.convert_from_path(
+ self.document_path,
+ dpi=self.dpi,
+ first_page=page_number,
+ last_page=page_number,
+ )
+ self._page_cache[page_number] = images[0]
+ return self._page_cache[page_number].copy()
+
+ def extract_region(self, page_number: int, bbox: list[float]) -> Image.Image | None:
+ """
+ Clip a specific region on a page as an image
+ """
+ page_image = self.get_page_image(page_number)
+ page_width, page_height = page_image.size
+
+ pdf_x1, pdf_y1, pdf_x2, pdf_y2 = bbox
+ pil_x1 = int(pdf_x1 * self._scale)
+ pil_x2 = int(pdf_x2 * self._scale)
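+        # PDF coordinates use a bottom-left origin while PIL uses top-left,
+        # so the y-axis is flipped when mapping the bounding box.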
+ pil_y1 = int(page_height - (pdf_y2 * self._scale))
+ pil_y2 = int(page_height - (pdf_y1 * self._scale))
+
+ # Clamp
+ left, right = min(pil_x1, pil_x2), max(pil_x1, pil_x2)
+ upper, lower = min(pil_y1, pil_y2), max(pil_y1, pil_y2)
+
+ left = max(0, min(left, page_width))
+ right = max(0, min(right, page_width))
+ upper = max(0, min(upper, page_height))
+ lower = max(0, min(lower, page_height))
+
+ width = right - left
+ height = lower - upper
+
+ # validate dimensions before cropping
+ if not _is_valid_image_dimensions(width, height):
+ log.debug("Skipping image with invalid dimensions: %dx%d", width, height)
+ return None
+
+ return page_image.crop((left, upper, right, lower))
+
+ def cleanup(self) -> None:
+ """
+ Clean up processed images
+ """
+ for page_image in self._page_cache.values():
+ page_image.close()
+ self._page_cache.clear()
+
+
+class OCRProcessor:
+ def __init__(self, batch_size: int = IMAGE_BATCH_SIZE):
+ self.batch_size = batch_size
+
+ def process_images(self, images: list[ImageForOCR]) -> dict[int, str]:
+ """
+ Batch OCR images
+ """
+ if not images:
+ return {}
+ block_ids = [img.block_id for img in images]
+ messages = self._prepare_messages(images)
+ ocr_texts = self._execute_batch_ocr(messages)
+ return dict(zip(block_ids, ocr_texts, strict=True))
+
+ def _prepare_messages(self, images: list[ImageForOCR]) -> list[list[dict]]:
+ messages = []
+ for img in images:
+ image_uri = _image_to_base64_uri(img.pil_image)
+ img.pil_image.close()
+ messages.append(_build_ocr_message(image_uri, settings.OCR_PROMPT))
+ return messages
+
+ def _execute_batch_ocr(self, messages_list: list[list[dict]]) -> list[str]:
+ all_texts = []
+ for i in range(0, len(messages_list), self.batch_size):
+ batch = messages_list[i : i + self.batch_size]
+ responses = batch_completion(
+ custom_llm_provider=settings.LITELLM_CUSTOM_PROVIDER,
+ api_base=settings.LITELLM_API_BASE,
+ model=settings.OCR_MODEL,
+ messages=batch,
+ )
+ all_texts.extend([resp.choices[0].message.content for resp in responses])
+ gc.collect()
+ return all_texts
+
+
+class MarkdownAssembler:
+ """
+ Assembles markdown from parsed content blocks.
+
+ Handles all content types from OpenDataLoader JSON output:
+ - heading: Section headings with levels
+ - paragraph: Text paragraphs
+ - image: Embedded images (processed via OCR)
+ - table: Tables with rows and cells
+ - list: Ordered/unordered lists with items
+ - text chunk / text block: Raw text content
+ - line: Single lines of text
+ - list item: Individual list items (when standalone)
+ """
+
+ def assemble(self, blocks: list[ContentBlock], ocr_results: dict[int, str]) -> str:
+ formatted_parts = []
+ for block in blocks:
+ # Full Page OCR overrides all individual blocks on that page
+ if block.block_type == BlockType.FULL_PAGE_OCR:
+ formatted_parts.append(ocr_results.get(block.block_id, ""))
+ continue
+
+ # Standard formatting
+ text = self._format_block(block, ocr_results)
+ if text:
+ formatted_parts.append(text)
+ return "\n\n".join(formatted_parts)
+
+ def _format_block(
+ self, block: ContentBlock, ocr_results: dict[int, str]
+ ) -> str | None:
+ """
+ Format a content block to markdown.
+
+ Supports all content types from OpenDataLoader:
+ - heading: Markdown headings with # prefix
+ - paragraph: Plain text paragraphs
+ - image: OCR results or placeholder
+ - table: Markdown tables with | separators
+ - list: Markdown lists with - or 1. prefixes
+ - text chunk / text block / line: Plain text
+ - list item: Single list item (when standalone)
+ """
+ block_type = block.block_type
+ block_content = None
+ # Heading
+ if block_type == BlockType.HEADING and block.content:
+ level = min(block.heading_level or 1, 6)
+ block_content = f"{'#' * level} {block.content}"
+
+ # Paragraph
+ if block_type == BlockType.PARAGRAPH and block.content:
+ block_content = block.content
+
+ # Image - use OCR result
+ if block_type == BlockType.IMAGE:
+ block_content = ocr_results.get(block.block_id)
+
+ # Table
+ if block_type == BlockType.TABLE:
+ block_content = self._format_table(block)
+
+ # List
+ if block_type == BlockType.LIST:
+ block_content = self._format_list(block)
+
+ # Text chunk / text block / line - plain text content
+ if block_type in (BlockType.TEXT_CHUNK, BlockType.TEXT_BLOCK, BlockType.LINE):
+ block_content = block.content if block.content else None
+
+ # Standalone list item
+ if block_type == BlockType.LIST_ITEM and block.content:
+ block_content = f"- {block.content}"
+
+ # Table cell (standalone, unusual but possible)
+ if block_type == BlockType.TABLE_CELL and block.content:
+ block_content = block.content
+
+ # Table row (standalone, unusual but possible)
+ if block_type == BlockType.TABLE_ROW:
+ block_content = self._format_table_row_standalone(block)
+
+ return block_content
+
+ def _format_table(self, block: ContentBlock) -> str | None:
+ """
+ Format a table block to markdown table syntax.
+ """
+ if not block.rows:
+ # Try to extract table from nested kids structure
+ return self._format_table_from_kids(block)
+
+ lines = []
+ num_cols = block.number_of_columns or self._detect_column_count(block.rows)
+
+ for row_idx, row in enumerate(block.rows):
+ # Build row content
+ cells_content = []
+ for cell in row.cells:
+ cell_text = self._get_cell_content(cell)
+ # Replace newlines with HTML breaks for table cells
+                cell_text = cell_text.replace("\n", "<br>") if cell_text else ""
+ cells_content.append(cell_text)
+
+ # Pad if fewer cells than expected
+ while len(cells_content) < num_cols:
+ cells_content.append("")
+
+ row_line = "| " + " | ".join(cells_content) + " |"
+ lines.append(row_line)
+
+ # Add header separator after first row
+ if row_idx == 0:
+ separator = "| " + " | ".join(["---"] * num_cols) + " |"
+ lines.append(separator)
+
+ return "\n".join(lines) if lines else None
+
+ def _format_table_from_kids(self, block: ContentBlock) -> str | None:
+ """
+ Format table when structure is in 'kids' field rather than 'rows'.
+ """
+ if not block.kids:
+ return None
+
+ lines = []
+ num_cols = block.number_of_columns or 0
+ row_idx = 0
+
+ for kid in block.kids:
+ if isinstance(kid, dict) and kid.get("type") == BlockType.TABLE_ROW:
+ cells = kid.get("cells", [])
+ cells_content = []
+
+ for cell_data in cells:
+ if isinstance(cell_data, dict):
+ cell_text = cell_data.get("content", "")
+ # Handle nested kids in cell
+ if not cell_text and "kids" in cell_data:
+ cell_text = self._extract_text_from_kids(cell_data["kids"])
+ else:
+ cell_text = str(cell_data) if cell_data else ""
+
+                    cell_text = cell_text.replace("\n", "<br>") if cell_text else ""
+ cells_content.append(cell_text)
+
+ if not num_cols:
+ num_cols = len(cells_content)
+
+ while len(cells_content) < num_cols:
+ cells_content.append("")
+
+ row_line = "| " + " | ".join(cells_content) + " |"
+ lines.append(row_line)
+
+ if row_idx == 0:
+ separator = "| " + " | ".join(["---"] * num_cols) + " |"
+ lines.append(separator)
+
+ row_idx += 1
+
+ return "\n".join(lines) if lines else None
+
+ def _format_table_row_standalone(self, block: ContentBlock) -> str | None:
+ """Format a standalone table row (unusual case)."""
+ if not hasattr(block, "kids") or not block.kids:
+ return block.content
+
+ cells_content = []
+ for cell in block.kids:
+ if isinstance(cell, dict):
+ cell_text = cell.get("content", "")
+ else:
+ cell_text = str(cell) if cell else ""
+            cells_content.append(cell_text.replace("\n", "<br>") if cell_text else "")
+
+ return "| " + " | ".join(cells_content) + " |" if cells_content else None
+
+ def _get_cell_content(self, cell: TableCell) -> str:
+ """Extract text content from a table cell."""
+ if cell.content:
+ return cell.content
+
+ # Check nested kids for content
+ if cell.kids:
+ return self._extract_text_from_kids(cell.kids)
+
+ return ""
+
+ def _extract_text_from_kids(self, kids: list[Any]) -> str:
+ """Recursively extract text from nested kids structure."""
+ texts = []
+ for kid in kids:
+ if isinstance(kid, dict):
+ if "content" in kid:
+ texts.append(kid["content"])
+ if "kids" in kid:
+ texts.append(self._extract_text_from_kids(kid["kids"]))
+ elif isinstance(kid, str):
+ texts.append(kid)
+ return " ".join(filter(None, texts))
+
+ def _detect_column_count(self, rows: list[TableRow]) -> int:
+ """Detect number of columns from rows."""
+ if not rows:
+ return 0
+ return max(len(row.cells) for row in rows)
+
+ def _format_list(self, block: ContentBlock) -> str | None:
+ """
+ Format a list block to markdown list syntax.
+
+ Supports ordered (numbered) and unordered (bullet) lists.
+ """
+ if not block.list_items and not block.kids:
+ return None
+
+ lines = []
+ is_ordered = self._is_ordered_list(block.numbering_style)
+
+ # Use list_items if available, otherwise parse from kids
+ items = (
+ block.list_items
+ if block.list_items
+ else self._parse_list_items_from_kids(block.kids)
+ )
+
+ for idx, item in enumerate(items, start=1):
+ item_content = self._get_list_item_content(item)
+ if item_content:
+ if is_ordered:
+ lines.append(f"{idx}. {item_content}")
+ else:
+ lines.append(f"- {item_content}")
+
+ return "\n".join(lines) if lines else None
+
+ def _is_ordered_list(self, numbering_style: str | None) -> bool:
+ """Determine if list should be ordered based on numbering style."""
+ if not numbering_style:
+ return False
+ # Common ordered list indicators
+ ordered_indicators = ["decimal", "number", "alpha", "roman", "1", "a", "i"]
+ return any(ind in numbering_style.lower() for ind in ordered_indicators)
+
+ def _parse_list_items_from_kids(self, kids: list[Any]) -> list[ListItem]:
+ """Parse list items from kids structure."""
+
+ return [
+ ListItem(content=kid.get("content"), kids=kid.get("kids", []))
+ for kid in kids
+ if isinstance(kid, dict) and kid.get("type") == BlockType.LIST_ITEM
+ ]
+
+ def _get_list_item_content(self, item: ListItem) -> str:
+ """Extract text content from a list item."""
+ if item.content:
+ return item.content
+
+ if item.kids:
+ return self._extract_text_from_kids(item.kids)
+
+ return ""
+
+
+class OpenDataLoaderLLMConverter:
+ def __init__(
+ self,
+ document_path: Path,
+ output_dir: Path | None = None,
+ pdf_dpi: int = 150,
+ *,
+ debug_mode=False,
+ ):
+ self.document_path = Path(document_path)
+ self.debug_mode = debug_mode
+ self._tempdir = tempfile.TemporaryDirectory()
+ self.output_dir = output_dir or Path(self._tempdir.name)
+ self._page_renderer = PDFPageRenderer(self.document_path, dpi=pdf_dpi)
+ self._ocr_processor = OCRProcessor()
+ self._markdown_assembler = MarkdownAssembler()
+
+ if debug_mode:
+ litellm._turn_on_debug() # noqa: SLF001
+
+ @property
+ def _debug_dir(self) -> Path:
+ d = Path(settings.OCR_DEBUG_DIRECTORY) / self.document_path.stem
+ d.mkdir(parents=True, exist_ok=True)
+ return d
+
+ def _save_debug_image(self, pil_image: Image.Image, prefix: str = "") -> str:
+ fp = self._debug_dir / f"{prefix}{self.document_path.name}-{uuid.uuid4()}.png"
+ pil_image.save(fp)
+ return str(fp)
+
+ def _save_debug_markdown(self, markdown_content: str) -> str:
+ fp = self._debug_dir / f"{self.document_path.name}.md"
+ fp.write_text(markdown_content)
+ return str(fp)
+
+ def _convert_pdf_to_json(self) -> dict[str, Any]:
+ self.output_dir.mkdir(parents=True, exist_ok=True)
+ opendataloader_pdf.convert(
+ input_path=[str(self.document_path)],
+ output_dir=str(self.output_dir),
+ format="json",
+ use_struct_tree=True,
+ keep_line_breaks=True,
+ )
+ return json.loads(
+ (self.output_dir / f"{self.document_path.stem}.json").read_text()
+ )
+
+ def _parse_content_block(self, block_data: dict[str, Any]) -> ContentBlock:
+ """
+ Parse a content block from JSON data.
+
+ Handles all content types including nested structures for tables and lists.
+ """
+ block_type = block_data.get("type", "unknown")
+
+ # Parse table rows if present
+ rows = []
+ if block_type == BlockType.TABLE and "rows" in block_data:
+ rows = self._parse_table_rows(block_data["rows"])
+
+ # Parse list items if present
+ list_items = []
+ if block_type == BlockType.LIST and "list items" in block_data:
+ list_items = self._parse_list_items(block_data["list items"])
+
+ return ContentBlock(
+ block_type=block_type,
+ block_id=block_data.get("id", 0),
+ page_number=block_data.get("page number", 1),
+ bounding_box=block_data.get("bounding box", [0, 0, 0, 0]),
+ content=block_data.get("content"),
+ heading_level=block_data.get("heading level"),
+ font=block_data.get("font"),
+ rows=rows,
+ number_of_rows=block_data.get("number of rows", 0),
+ number_of_columns=block_data.get("number of columns", 0),
+ list_items=list_items,
+ numbering_style=block_data.get("numbering style"),
+ kids=block_data.get("kids", []),
+ )
+
+ def _parse_table_rows(self, rows_data: list[dict]) -> list[TableRow]:
+ """Parse table rows from JSON data."""
+ rows = []
+ for row_data in rows_data:
+ cells = [
+ TableCell(
+ content=cell_data.get("content"),
+ column_number=cell_data.get("column number", 0),
+ row_number=cell_data.get("row number", 0),
+ column_span=cell_data.get("column span", 1),
+ row_span=cell_data.get("row span", 1),
+ kids=cell_data.get("kids", []),
+ )
+ for cell_data in row_data.get("cells", [])
+ ]
+
+ rows.append(
+ TableRow(
+ row_number=row_data.get("row number", 0),
+ cells=cells,
+ )
+ )
+ return rows
+
+ def _parse_list_items(self, items_data: list[dict]) -> list[ListItem]:
+ """Parse list items from JSON data."""
+ return [
+ ListItem(
+ content=item_data.get("content"),
+ kids=item_data.get("kids", []),
+ )
+ for item_data in items_data
+ ]
+
+ def _calculate_page_math_score(self, blocks: list[ContentBlock]) -> int:
+ """
+ Calculate a score representing 'math density' for a page.
+ """
+ score = 0
+ for b in blocks:
+ if b.font:
+ font_base = "".join([c for c in b.font.lower() if c.isalpha()])
+ if any(mf in font_base for mf in MATH_FONTS):
+ score += 1
+
+ if b.content:
+ text = b.content.strip()
+ if COMPLEX_MATH_REGEX.search(text):
+ score += 3
+ elif BASIC_MATH_SYMBOLS.search(text):
+ score += 1
+ return score
+
+ def convert_to_markdown(self) -> str:
+ try:
+ # get document elements and structure as json
+ json_data = self._convert_pdf_to_json()
+ raw_blocks = [
+ self._parse_content_block(k) for k in json_data.get("kids", [])
+ ]
+
+ # Group by Page
+ pages = defaultdict(list)
+ for b in raw_blocks:
+ pages[b.page_number].append(b)
+
+ final_blocks = []
+ images_for_ocr = []
+
+ # Process Per Page
+ for page_num in sorted(pages.keys()):
+ page_blocks = pages[page_num]
+
+ math_score = self._calculate_page_math_score(page_blocks)
+ should_full_ocr = math_score > settings.OCR_MATH_DENSITY_THRESHOLD
+
+ if should_full_ocr:
+ log.info(
+ "Page %d: High Math Score (%f).Strategy: Full Page OCR.",
+ page_num,
+ math_score,
+ )
+
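+                    # Synthetic ID for the full-page OCR block; the 888000
+                    # offset keeps it clear of parser-assigned block IDs.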
+ page_block_id = 888000 + page_num
+
+ full_page_img = self._page_renderer.get_page_image(page_num)
+ optimized_img = _optimize_image(full_page_img)
+
+ images_for_ocr.append(
+ ImageForOCR(page_block_id, optimized_img, is_full_page=True)
+ )
+
+ final_blocks.append(
+ ContentBlock(
+ block_type=BlockType.FULL_PAGE_OCR,
+ block_id=page_block_id,
+ page_number=page_num,
+ bounding_box=[],
+ )
+ )
+
+ if self.debug_mode:
+ self._save_debug_image(
+ optimized_img, prefix=f"FULLPAGE_{page_num}_"
+ )
+
+ else:
+ log.info(
+ "Page %d: Low Math Score (%f).Strategy: Standard Parse.",
+ page_num,
+ math_score,
+ )
+
+ for block in page_blocks:
+ final_blocks.append(block)
+
+ if block.block_type == BlockType.IMAGE:
+ img = self._page_renderer.extract_region(
+ block.page_number, block.bounding_box
+ )
+ if img:
+ opt = _optimize_image(img)
+ images_for_ocr.append(
+ ImageForOCR(block.block_id, opt, is_full_page=False)
+ )
+ if self.debug_mode:
+ self._save_debug_image(
+ opt, prefix=f"IMG_{block.block_id}_"
+ )
+
+ # Batch OCR
+ ocr_results = self._ocr_processor.process_images(images_for_ocr)
+
+ # Assemble
+ final_md = self._markdown_assembler.assemble(final_blocks, ocr_results)
+
+ if self.debug_mode:
+ self._save_debug_markdown(final_md)
+
+ return final_md
+
+ finally:
+ self._page_renderer.cleanup()
diff --git a/learning_resources/converters/opendataloader_llm_converter_test.py b/learning_resources/converters/opendataloader_llm_converter_test.py
new file mode 100644
index 0000000000..8133a63ee7
--- /dev/null
+++ b/learning_resources/converters/opendataloader_llm_converter_test.py
@@ -0,0 +1,106 @@
+from pathlib import Path
+
+import pytest
+
+from learning_resources.converters.opendataloader_llm_converter import (
+ ImageForOCR,
+ OpenDataLoaderLLMConverter,
+)
+
+
+@pytest.fixture
+def fake_renderer(mocker):
+ renderer = mocker.MagicMock()
+    renderer.get_page_image.return_value = mocker.MagicMock(name="page_image")
+ renderer.cleanup = mocker.MagicMock()
+ return renderer
+
+
+@pytest.fixture
+def fake_ocr(mocker):
+ ocr = mocker.MagicMock()
+ ocr.ocr_image.return_value = "OCR_TEXT"
+ return ocr
+
+
+@pytest.fixture(autouse=True)
+def mock_litellm(mocker):
+ mocker.patch(
+ "learning_resources.converters.opendataloader_llm_converter.litellm.completion",
+ return_value={"choices": [{"message": {"content": "OCR TEXT"}}]},
+ )
+ mocker.patch(
+ "learning_resources.converters.opendataloader_llm_converter.OCRProcessor._execute_batch_ocr",
+ return_value=["OCR TEXT"],
+ )
+
+
+def test_basic_conversion(settings, fake_renderer, mocker):
+ """
+ Test a very basic conversion of pdf to markdown
+ """
+ settings.OCR_MODEL = "test"
+ sample_pdf = Path("test_pdfs/notes.pdf")
+ mocker.patch(
+ "learning_resources.converters.opendataloader_llm_converter._optimize_image",
+ return_value=mocker.MagicMock(),
+ )
+ mocker.patch(
+ "learning_resources.converters.opendataloader_llm_converter.PDFPageRenderer",
+ return_value=fake_renderer,
+ )
+ converter = OpenDataLoaderLLMConverter(document_path=sample_pdf, debug_mode=False)
+ markdown = converter.convert_to_markdown()
+ assert isinstance(markdown, str)
+ assert "OCR TEXT" in markdown
+
+
+def test_debug_images_written(tmp_path, mocker, settings, fake_renderer):
+ """
+ Test debug_mode flag outputs debug images
+ """
+ settings.OCR_MODEL = "test"
+ mocker.patch(
+ "learning_resources.converters.opendataloader_llm_converter.settings.OCR_DEBUG_DIRECTORY",
+ tmp_path,
+ )
+ mocker.patch(
+ "learning_resources.converters.opendataloader_llm_converter._optimize_image",
+ return_value=mocker.MagicMock(),
+ )
+ mocker.patch(
+ "learning_resources.converters.opendataloader_llm_converter.PDFPageRenderer",
+ return_value=fake_renderer,
+ )
+ sample_pdf = Path("test_pdfs/notes.pdf")
+ converter = OpenDataLoaderLLMConverter(sample_pdf, debug_mode=True)
+ converter.convert_to_markdown()
+ assert len(list(tmp_path.glob("*notes"))) > 0
+
+
+def test_tiny_images_are_skipped(fake_renderer, fake_ocr, mocker):
+ """
+ Test that small images or images with odd dimensions are skipped
+ """
+    mocker.patch(
+        "learning_resources.converters.opendataloader_llm_converter.PDFPageRenderer",
+        return_value=fake_renderer,
+    )
+    mocker.patch(
+        "learning_resources.converters.opendataloader_llm_converter.OCRProcessor",
+        return_value=fake_ocr,
+    )
+    sample_pdf = Path("test_pdfs/notes.pdf")
+    conv = OpenDataLoaderLLMConverter(sample_pdf)
+
+    tiny = ImageForOCR(
+        pil_image=mocker.MagicMock(size=(41, 5)),
+        is_full_page=False,
+        block_id="tiny",
+    )
+
+    conv._ocr_processor.process_images([tiny])  # noqa: SLF001
+
+    fake_ocr.ocr_image.assert_not_called()
diff --git a/learning_resources/etl/canvas.py b/learning_resources/etl/canvas.py
index 277d27d540..86455005df 100644
--- a/learning_resources/etl/canvas.py
+++ b/learning_resources/etl/canvas.py
@@ -158,7 +158,9 @@ def _generate_content():
else:
log.debug("skipping unpublished file %s", member.filename)
- for content_data in process_olx_path(olx_path, run, overwrite=overwrite):
+ for content_data in process_olx_path(
+ olx_path, run, overwrite=overwrite, use_ocr=True
+ ):
url_path = content_data["source_path"].lstrip(
content_data["source_path"].split("/")[0]
)
@@ -209,6 +211,7 @@ def transform_canvas_problem_files(
overwrite=overwrite,
valid_file_types=VALID_TUTOR_PROBLEM_FILE_TYPES,
is_tutor_problem_file_import=True,
+ use_ocr=True,
):
keys_to_keep = [
"run",
diff --git a/learning_resources/etl/canvas_test.py b/learning_resources/etl/canvas_test.py
index 710f80c853..3340ee2821 100644
--- a/learning_resources/etl/canvas_test.py
+++ b/learning_resources/etl/canvas_test.py
@@ -96,6 +96,11 @@
"""
+@pytest.fixture
+def sample_pdf_content():
+ return Path("test_pdfs/notes.pdf").read_bytes()
+
+
@pytest.fixture(autouse=True)
def canvas_platform():
"""Fixture for the canvas platform"""
@@ -443,8 +448,8 @@ def test_transform_canvas_content_files_removes_unpublished_content(mocker, tmp_
@pytest.mark.parametrize("overwrite", [True, False])
@pytest.mark.parametrize("existing_file", [True, False])
-def test_transform_canvas_problem_files_pdf_calls_pdf_to_markdown(
- tmp_path, mocker, settings, overwrite, existing_file
+def test_transform_canvas_problem_files_pdf_calls_pdf_to_markdown( # noqa: PLR0913
+ tmp_path, mocker, settings, overwrite, existing_file, sample_pdf_content
):
"""
Test that transform_canvas_problem_files calls _pdf_to_markdown for PDF files.
@@ -452,11 +457,12 @@ def test_transform_canvas_problem_files_pdf_calls_pdf_to_markdown(
"""
settings.CANVAS_TUTORBOT_FOLDER = "tutorbot/"
- settings.CANVAS_PDF_TRANSCRIPTION_MODEL = "fake-model"
+ settings.OCR_MODEL = "fake-model"
+
pdf_filename = "problemset1/problem.pdf"
- pdf_content = b"%PDF-1.4 fake pdf content"
+
zip_path = make_canvas_zip(
- tmp_path, files=[(f"tutorbot/{pdf_filename}", pdf_content)]
+ tmp_path, files=[(f"tutorbot/{pdf_filename}", sample_pdf_content)]
)
# return a file with pdf extension
@@ -524,7 +530,7 @@ def test_transform_canvas_problem_files_non_pdf_does_not_call_pdf_to_markdown(
there is an existing file.
"""
settings.CANVAS_TUTORBOT_FOLDER = "tutorbot/"
- settings.CANVAS_PDF_TRANSCRIPTION_MODEL = "fake-model"
+ settings.OCR_MODEL = "fake-model"
csv_filename = "problemset2/problem.csv"
csv_content = "a,b,c\n1,2,3"
zip_path = make_canvas_zip(
@@ -1031,7 +1037,7 @@ def test_parse_files_meta_excludes_tutorbot_folder(tmp_path, settings):
assert result["unpublished"][0]["path"].name == "tutorfile.html"
-def test_embedded_files_from_html(tmp_path, mocker):
+def test_embedded_files_from_html(tmp_path, mocker, sample_pdf_content):
"""
Test that _embedded_files_from_html processes files embedded in HTML content
even if they are not in modules_meta.xml or files_meta.xml.
@@ -1083,7 +1089,7 @@ def test_embedded_files_from_html(tmp_path, mocker):
tmp_path, module_xml=module_xml, manifest_xml=manifest_xml
)
with zipfile.ZipFile(zip_path, "a") as zf:
- zf.writestr("web_resources/file1.pdf", "content of file1")
+ zf.writestr("web_resources/file1.pdf", sample_pdf_content)
zf.writestr("web_resources/file2.html", "content of file2")
zf.writestr("web_resources/html_page.html", html_content)
@@ -1742,7 +1748,9 @@ def test_get_published_items_for_attachment_module(mocker, tmp_path):
assert Path("web_resources/visible_attachment_module.txt").resolve() in published
-def test_ingestion_finishes_with_missing_xml_files(tmp_path, mocker):
+def test_ingestion_finishes_with_missing_xml_files(
+ tmp_path, mocker, sample_pdf_content
+):
"""
Test that canvas course ingestion succeeds even if some config XML files are missing
"""
@@ -1787,7 +1795,7 @@ def test_ingestion_finishes_with_missing_xml_files(tmp_path, mocker):
manifest_xml=manifest_xml,
files=[
("course_settings/files_meta.xml", files_xml),
- ("web_resources/file1.pdf", "content of file1"),
+ ("web_resources/file1.pdf", sample_pdf_content),
("web_resources/file2.html", "content of file2"),
("web_resources/html_page.html", ""),
],
@@ -1804,3 +1812,75 @@ def test_ingestion_finishes_with_missing_xml_files(tmp_path, mocker):
)
assert run is not None
assert len(content_results) > 0
+
+
+@pytest.mark.parametrize("overwrite", [True, False])
+@pytest.mark.parametrize("existing_file", [True, False])
+def test_transform_canvas_problem_files_pdf_calls_llm_for_ocr( # noqa: PLR0913
+ tmp_path, mocker, settings, overwrite, existing_file, sample_pdf_content
+):
+ """
+    Test that transform_canvas_problem_files calls _pdf_to_markdown for PDF files
+    if overwrite is True or there is no existing file. Tika should not be called.
+ """
+
+ settings.CANVAS_TUTORBOT_FOLDER = "tutorbot/"
+ settings.OCR_MODEL = "test model"
+
+ pdf_filename = "problemset1/problem.pdf"
+
+ zip_path = make_canvas_zip(
+ tmp_path, files=[(f"tutorbot/{pdf_filename}", sample_pdf_content)]
+ )
+
+ # return a file with pdf extension
+ fake_file_data = {
+ "run": "run",
+ "content_type": "application/pdf",
+ "archive_checksum": "checksum",
+ "source_path": f"tutorbot/{pdf_filename}",
+ "file_extension": ".pdf",
+ }
+
+ mocker.patch(
+ "learning_resources.etl.utils.documents_from_olx",
+ return_value=iter([[mocker.Mock(), fake_file_data]]),
+ )
+
+ # Patch _pdf_to_markdown to return a known value
+ pdf_to_md = mocker.patch(
+ "learning_resources.etl.utils._pdf_to_markdown",
+ return_value="markdown content from pdf",
+ )
+
+ tika = mocker.patch(
+ "learning_resources.etl.utils.extract_text_metadata",
+ )
+
+ run = LearningResourceRunFactory.create()
+
+ if existing_file:
+ TutorProblemFileFactory.create(
+ run=run,
+ type="problem",
+ archive_checksum="checksum",
+ source_path=f"tutorbot/{pdf_filename}",
+ content="existing content",
+ file_name="problem1.pdf",
+ )
+
+ results = list(transform_canvas_problem_files(zip_path, run, overwrite=overwrite))
+
+ if overwrite or not existing_file:
+ pdf_to_md.assert_called_once()
+ else:
+ pdf_to_md.assert_not_called()
+
+ tika.assert_not_called()
+
+    assert results[0]["content"] == (
+        "markdown content from pdf"
+        if overwrite or not existing_file
+        else "existing content"
+    )
+ assert results[0]["problem_title"] == "problemset1"
diff --git a/learning_resources/etl/utils.py b/learning_resources/etl/utils.py
index 7cbdf0e73c..7c42ad0eb9 100644
--- a/learning_resources/etl/utils.py
+++ b/learning_resources/etl/utils.py
@@ -29,9 +29,9 @@
from django.conf import settings
from django.utils.dateparse import parse_duration
from django.utils.text import slugify
-from litellm import completion
from PIL import Image
from pycountry import currencies
+from pypdf import PdfReader
from tika import parser as tika_parser
from learning_resources.constants import (
@@ -46,6 +46,9 @@
OfferedBy,
RunStatus,
)
+from learning_resources.converters.opendataloader_llm_converter import (
+ OpenDataLoaderLLMConverter,
+)
from learning_resources.etl.constants import (
RESOURCE_DELIVERY_MAPPING,
TIME_INTERVAL_MAPPING,
@@ -534,13 +537,14 @@ def get_video_metadata(olx_path: str, run: LearningResourceRun) -> dict:
return video_transcript_mapping
-def process_olx_path(
+def process_olx_path( # noqa: PLR0913
olx_path: str,
run: LearningResourceRun,
*,
overwrite,
valid_file_types=VALID_TEXT_FILE_TYPES,
is_tutor_problem_file_import=False,
+ use_ocr=False,
) -> Generator[dict, None, None]:
video_srt_metadata = get_video_metadata(olx_path, run)
for document, metadata in documents_from_olx(
@@ -571,8 +575,13 @@ def process_olx_path(
}
elif (
file_extension == ".pdf"
- and is_tutor_problem_file_import
- and settings.CANVAS_PDF_TRANSCRIPTION_MODEL
+ and use_ocr
+ and settings.OCR_MODEL
+ and (
+ len(PdfReader(Path(olx_path) / Path(source_path)).pages)
+ <= settings.OCR_PDF_MAX_PAGE_THRESHOLD
+ or is_tutor_problem_file_import
+ )
):
markdown_content = _pdf_to_markdown(Path(olx_path) / Path(source_path))
content_dict = {
@@ -1072,38 +1081,8 @@ def _pdf_to_markdown(pdf_path):
"""
Convert a PDF file to markdown using an llm
"""
- markdown = ""
- for im in pdf_to_base64_images(pdf_path):
- response = completion(
- api_base=settings.LITELLM_API_BASE,
- custom_llm_provider=settings.LITELLM_CUSTOM_PROVIDER,
- model=settings.CANVAS_PDF_TRANSCRIPTION_MODEL,
- messages=[
- {
- "role": "user",
- "content": [
- {
- "type": "text",
- "text": settings.CANVAS_TRANSCRIPTION_PROMPT,
- },
- {
- "type": "image_url",
- "image_url": {
- "url": f"data:image/jpeg;base64,{im}",
- },
- },
- ],
- }
- ],
- )
- markdown_snippet = (
- response.json()["choices"][0]["message"]["content"]
- .removeprefix("```markdown\n")
- .removesuffix("\n```")
- )
-
- markdown += markdown_snippet
- return markdown
+ converter = OpenDataLoaderLLMConverter(pdf_path)
+ return converter.convert_to_markdown()
def pdf_to_base64_images(pdf_path, fmt="JPEG", max_size=2000, quality=85):
diff --git a/main/settings_course_etl.py b/main/settings_course_etl.py
index 6f70c4d6c4..7b01bd504b 100644
--- a/main/settings_course_etl.py
+++ b/main/settings_course_etl.py
@@ -68,14 +68,27 @@
CANVAS_COURSE_BUCKET_PREFIX = get_string(
"CANVAS_COURSE_BUCKET_PREFIX", "canvas/course_content"
)
-CANVAS_PDF_TRANSCRIPTION_MODEL = get_string(
- name="CANVAS_PDF_TRANSCRIPTION_MODEL", default=None
-)
-CANVAS_TRANSCRIPTION_PROMPT = get_string(
- "CANVAS_TRANSCRIPTION_PROMPT",
- """Transcribe the contents of this file into markdown.
- Do not include anything but the markdown content in your response""",
+
+OCR_MODEL = get_string(name="OCR_MODEL", default=None)
+
+OCR_PROMPT = get_string(
+ "OCR_PROMPT",
+ "Transcribe this image to markdown. Properly format text, formulas, "
+ "tables, and code into markdown format."
+ "Include a markdown comment for elements that cannot be transcribed such as "
+ "photos and other visual elements."
+ "Do not include any indications your output is the result of a transcription."
+ "Do not include extra commentary - "
+ "ONLY include the resulting transcribed markdown.",
)
+# Do not OCR if the PDF exceeds this many pages
+OCR_PDF_MAX_PAGE_THRESHOLD = get_int(name="OCR_PDF_MAX_PAGE_THRESHOLD", default=10)
+
+# OCR the entire page if the density of math formulas exceeds this threshold
+OCR_MATH_DENSITY_THRESHOLD = get_int(name="OCR_MATH_DENSITY_THRESHOLD", default=5)
+OCR_DEBUG_DIRECTORY = get_string(name="OCR_DEBUG_DIRECTORY", default="ocr_debug")
+
+
# More MIT URLs
SEE_API_URL = get_string("SEE_API_URL", None)
SEE_API_ACCESS_TOKEN_URL = get_string("SEE_API_ACCESS_TOKEN_URL", None)
diff --git a/poetry.lock b/poetry.lock
index bd745467ab..b5374f27bc 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -4977,86 +4977,67 @@ six = ">=1.9.0"
[[package]]
name = "numpy"
-version = "2.3.5"
+version = "2.2.6"
description = "Fundamental package for array computing in Python"
optional = false
-python-versions = ">=3.11"
+python-versions = ">=3.10"
groups = ["main"]
files = [
- {file = "numpy-2.3.5-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:de5672f4a7b200c15a4127042170a694d4df43c992948f5e1af57f0174beed10"},
- {file = "numpy-2.3.5-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:acfd89508504a19ed06ef963ad544ec6664518c863436306153e13e94605c218"},
- {file = "numpy-2.3.5-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:ffe22d2b05504f786c867c8395de703937f934272eb67586817b46188b4ded6d"},
- {file = "numpy-2.3.5-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:872a5cf366aec6bb1147336480fef14c9164b154aeb6542327de4970282cd2f5"},
- {file = "numpy-2.3.5-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3095bdb8dd297e5920b010e96134ed91d852d81d490e787beca7e35ae1d89cf7"},
- {file = "numpy-2.3.5-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8cba086a43d54ca804ce711b2a940b16e452807acebe7852ff327f1ecd49b0d4"},
- {file = "numpy-2.3.5-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:6cf9b429b21df6b99f4dee7a1218b8b7ffbbe7df8764dc0bd60ce8a0708fed1e"},
- {file = "numpy-2.3.5-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:396084a36abdb603546b119d96528c2f6263921c50df3c8fd7cb28873a237748"},
- {file = "numpy-2.3.5-cp311-cp311-win32.whl", hash = "sha256:b0c7088a73aef3d687c4deef8452a3ac7c1be4e29ed8bf3b366c8111128ac60c"},
- {file = "numpy-2.3.5-cp311-cp311-win_amd64.whl", hash = "sha256:a414504bef8945eae5f2d7cb7be2d4af77c5d1cb5e20b296c2c25b61dff2900c"},
- {file = "numpy-2.3.5-cp311-cp311-win_arm64.whl", hash = "sha256:0cd00b7b36e35398fa2d16af7b907b65304ef8bb4817a550e06e5012929830fa"},
- {file = "numpy-2.3.5-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:74ae7b798248fe62021dbf3c914245ad45d1a6b0cb4a29ecb4b31d0bfbc4cc3e"},
- {file = "numpy-2.3.5-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ee3888d9ff7c14604052b2ca5535a30216aa0a58e948cdd3eeb8d3415f638769"},
- {file = "numpy-2.3.5-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:612a95a17655e213502f60cfb9bf9408efdc9eb1d5f50535cc6eb365d11b42b5"},
- {file = "numpy-2.3.5-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:3101e5177d114a593d79dd79658650fe28b5a0d8abeb8ce6f437c0e6df5be1a4"},
- {file = "numpy-2.3.5-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8b973c57ff8e184109db042c842423ff4f60446239bd585a5131cc47f06f789d"},
- {file = "numpy-2.3.5-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0d8163f43acde9a73c2a33605353a4f1bc4798745a8b1d73183b28e5b435ae28"},
- {file = "numpy-2.3.5-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:51c1e14eb1e154ebd80e860722f9e6ed6ec89714ad2db2d3aa33c31d7c12179b"},
- {file = "numpy-2.3.5-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:b46b4ec24f7293f23adcd2d146960559aaf8020213de8ad1909dba6c013bf89c"},
- {file = "numpy-2.3.5-cp312-cp312-win32.whl", hash = "sha256:3997b5b3c9a771e157f9aae01dd579ee35ad7109be18db0e85dbdbe1de06e952"},
- {file = "numpy-2.3.5-cp312-cp312-win_amd64.whl", hash = "sha256:86945f2ee6d10cdfd67bcb4069c1662dd711f7e2a4343db5cecec06b87cf31aa"},
- {file = "numpy-2.3.5-cp312-cp312-win_arm64.whl", hash = "sha256:f28620fe26bee16243be2b7b874da327312240a7cdc38b769a697578d2100013"},
- {file = "numpy-2.3.5-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:d0f23b44f57077c1ede8c5f26b30f706498b4862d3ff0a7298b8411dd2f043ff"},
- {file = "numpy-2.3.5-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:aa5bc7c5d59d831d9773d1170acac7893ce3a5e130540605770ade83280e7188"},
- {file = "numpy-2.3.5-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:ccc933afd4d20aad3c00bcef049cb40049f7f196e0397f1109dba6fed63267b0"},
- {file = "numpy-2.3.5-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:afaffc4393205524af9dfa400fa250143a6c3bc646c08c9f5e25a9f4b4d6a903"},
- {file = "numpy-2.3.5-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9c75442b2209b8470d6d5d8b1c25714270686f14c749028d2199c54e29f20b4d"},
- {file = "numpy-2.3.5-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:11e06aa0af8c0f05104d56450d6093ee639e15f24ecf62d417329d06e522e017"},
- {file = "numpy-2.3.5-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:ed89927b86296067b4f81f108a2271d8926467a8868e554eaf370fc27fa3ccaf"},
- {file = "numpy-2.3.5-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:51c55fe3451421f3a6ef9a9c1439e82101c57a2c9eab9feb196a62b1a10b58ce"},
- {file = "numpy-2.3.5-cp313-cp313-win32.whl", hash = "sha256:1978155dd49972084bd6ef388d66ab70f0c323ddee6f693d539376498720fb7e"},
- {file = "numpy-2.3.5-cp313-cp313-win_amd64.whl", hash = "sha256:00dc4e846108a382c5869e77c6ed514394bdeb3403461d25a829711041217d5b"},
- {file = "numpy-2.3.5-cp313-cp313-win_arm64.whl", hash = "sha256:0472f11f6ec23a74a906a00b48a4dcf3849209696dff7c189714511268d103ae"},
- {file = "numpy-2.3.5-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:414802f3b97f3c1eef41e530aaba3b3c1620649871d8cb38c6eaff034c2e16bd"},
- {file = "numpy-2.3.5-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:5ee6609ac3604fa7780e30a03e5e241a7956f8e2fcfe547d51e3afa5247ac47f"},
- {file = "numpy-2.3.5-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:86d835afea1eaa143012a2d7a3f45a3adce2d7adc8b4961f0b362214d800846a"},
- {file = "numpy-2.3.5-cp313-cp313t-macosx_14_0_x86_64.whl", hash = "sha256:30bc11310e8153ca664b14c5f1b73e94bd0503681fcf136a163de856f3a50139"},
- {file = "numpy-2.3.5-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1062fde1dcf469571705945b0f221b73928f34a20c904ffb45db101907c3454e"},
- {file = "numpy-2.3.5-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ce581db493ea1a96c0556360ede6607496e8bf9b3a8efa66e06477267bc831e9"},
- {file = "numpy-2.3.5-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:cc8920d2ec5fa99875b670bb86ddeb21e295cb07aa331810d9e486e0b969d946"},
- {file = "numpy-2.3.5-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:9ee2197ef8c4f0dfe405d835f3b6a14f5fee7782b5de51ba06fb65fc9b36e9f1"},
- {file = "numpy-2.3.5-cp313-cp313t-win32.whl", hash = "sha256:70b37199913c1bd300ff6e2693316c6f869c7ee16378faf10e4f5e3275b299c3"},
- {file = "numpy-2.3.5-cp313-cp313t-win_amd64.whl", hash = "sha256:b501b5fa195cc9e24fe102f21ec0a44dffc231d2af79950b451e0d99cea02234"},
- {file = "numpy-2.3.5-cp313-cp313t-win_arm64.whl", hash = "sha256:a80afd79f45f3c4a7d341f13acbe058d1ca8ac017c165d3fa0d3de6bc1a079d7"},
- {file = "numpy-2.3.5-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:bf06bc2af43fa8d32d30fae16ad965663e966b1a3202ed407b84c989c3221e82"},
- {file = "numpy-2.3.5-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:052e8c42e0c49d2575621c158934920524f6c5da05a1d3b9bab5d8e259e045f0"},
- {file = "numpy-2.3.5-cp314-cp314-macosx_14_0_arm64.whl", hash = "sha256:1ed1ec893cff7040a02c8aa1c8611b94d395590d553f6b53629a4461dc7f7b63"},
- {file = "numpy-2.3.5-cp314-cp314-macosx_14_0_x86_64.whl", hash = "sha256:2dcd0808a421a482a080f89859a18beb0b3d1e905b81e617a188bd80422d62e9"},
- {file = "numpy-2.3.5-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:727fd05b57df37dc0bcf1a27767a3d9a78cbbc92822445f32cc3436ba797337b"},
- {file = "numpy-2.3.5-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:fffe29a1ef00883599d1dc2c51aa2e5d80afe49523c261a74933df395c15c520"},
- {file = "numpy-2.3.5-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:8f7f0e05112916223d3f438f293abf0727e1181b5983f413dfa2fefc4098245c"},
- {file = "numpy-2.3.5-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:2e2eb32ddb9ccb817d620ac1d8dae7c3f641c1e5f55f531a33e8ab97960a75b8"},
- {file = "numpy-2.3.5-cp314-cp314-win32.whl", hash = "sha256:66f85ce62c70b843bab1fb14a05d5737741e74e28c7b8b5a064de10142fad248"},
- {file = "numpy-2.3.5-cp314-cp314-win_amd64.whl", hash = "sha256:e6a0bc88393d65807d751a614207b7129a310ca4fe76a74e5c7da5fa5671417e"},
- {file = "numpy-2.3.5-cp314-cp314-win_arm64.whl", hash = "sha256:aeffcab3d4b43712bb7a60b65f6044d444e75e563ff6180af8f98dd4b905dfd2"},
- {file = "numpy-2.3.5-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:17531366a2e3a9e30762c000f2c43a9aaa05728712e25c11ce1dbe700c53ad41"},
- {file = "numpy-2.3.5-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:d21644de1b609825ede2f48be98dfde4656aefc713654eeee280e37cadc4e0ad"},
- {file = "numpy-2.3.5-cp314-cp314t-macosx_14_0_arm64.whl", hash = "sha256:c804e3a5aba5460c73955c955bdbd5c08c354954e9270a2c1565f62e866bdc39"},
- {file = "numpy-2.3.5-cp314-cp314t-macosx_14_0_x86_64.whl", hash = "sha256:cc0a57f895b96ec78969c34f682c602bf8da1a0270b09bc65673df2e7638ec20"},
- {file = "numpy-2.3.5-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:900218e456384ea676e24ea6a0417f030a3b07306d29d7ad843957b40a9d8d52"},
- {file = "numpy-2.3.5-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:09a1bea522b25109bf8e6f3027bd810f7c1085c64a0c7ce050c1676ad0ba010b"},
- {file = "numpy-2.3.5-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:04822c00b5fd0323c8166d66c701dc31b7fbd252c100acd708c48f763968d6a3"},
- {file = "numpy-2.3.5-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:d6889ec4ec662a1a37eb4b4fb26b6100841804dac55bd9df579e326cdc146227"},
- {file = "numpy-2.3.5-cp314-cp314t-win32.whl", hash = "sha256:93eebbcf1aafdf7e2ddd44c2923e2672e1010bddc014138b229e49725b4d6be5"},
- {file = "numpy-2.3.5-cp314-cp314t-win_amd64.whl", hash = "sha256:c8a9958e88b65c3b27e22ca2a076311636850b612d6bbfb76e8d156aacde2aaf"},
- {file = "numpy-2.3.5-cp314-cp314t-win_arm64.whl", hash = "sha256:6203fdf9f3dc5bdaed7319ad8698e685c7a3be10819f41d32a0723e611733b42"},
- {file = "numpy-2.3.5-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:f0963b55cdd70fad460fa4c1341f12f976bb26cb66021a5580329bd498988310"},
- {file = "numpy-2.3.5-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:f4255143f5160d0de972d28c8f9665d882b5f61309d8362fdd3e103cf7bf010c"},
- {file = "numpy-2.3.5-pp311-pypy311_pp73-macosx_14_0_arm64.whl", hash = "sha256:a4b9159734b326535f4dd01d947f919c6eefd2d9827466a696c44ced82dfbc18"},
- {file = "numpy-2.3.5-pp311-pypy311_pp73-macosx_14_0_x86_64.whl", hash = "sha256:2feae0d2c91d46e59fcd62784a3a83b3fb677fead592ce51b5a6fbb4f95965ff"},
- {file = "numpy-2.3.5-pp311-pypy311_pp73-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ffac52f28a7849ad7576293c0cb7b9f08304e8f7d738a8cb8a90ec4c55a998eb"},
- {file = "numpy-2.3.5-pp311-pypy311_pp73-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:63c0e9e7eea69588479ebf4a8a270d5ac22763cc5854e9a7eae952a3908103f7"},
- {file = "numpy-2.3.5-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:f16417ec91f12f814b10bafe79ef77e70113a2f5f7018640e7425ff979253425"},
- {file = "numpy-2.3.5.tar.gz", hash = "sha256:784db1dcdab56bf0517743e746dfb0f885fc68d948aba86eeec2cba234bdf1c0"},
+ {file = "numpy-2.2.6-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b412caa66f72040e6d268491a59f2c43bf03eb6c96dd8f0307829feb7fa2b6fb"},
+ {file = "numpy-2.2.6-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8e41fd67c52b86603a91c1a505ebaef50b3314de0213461c7a6e99c9a3beff90"},
+ {file = "numpy-2.2.6-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:37e990a01ae6ec7fe7fa1c26c55ecb672dd98b19c3d0e1d1f326fa13cb38d163"},
+ {file = "numpy-2.2.6-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:5a6429d4be8ca66d889b7cf70f536a397dc45ba6faeb5f8c5427935d9592e9cf"},
+ {file = "numpy-2.2.6-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:efd28d4e9cd7d7a8d39074a4d44c63eda73401580c5c76acda2ce969e0a38e83"},
+ {file = "numpy-2.2.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fc7b73d02efb0e18c000e9ad8b83480dfcd5dfd11065997ed4c6747470ae8915"},
+ {file = "numpy-2.2.6-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:74d4531beb257d2c3f4b261bfb0fc09e0f9ebb8842d82a7b4209415896adc680"},
+ {file = "numpy-2.2.6-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:8fc377d995680230e83241d8a96def29f204b5782f371c532579b4f20607a289"},
+ {file = "numpy-2.2.6-cp310-cp310-win32.whl", hash = "sha256:b093dd74e50a8cba3e873868d9e93a85b78e0daf2e98c6797566ad8044e8363d"},
+ {file = "numpy-2.2.6-cp310-cp310-win_amd64.whl", hash = "sha256:f0fd6321b839904e15c46e0d257fdd101dd7f530fe03fd6359c1ea63738703f3"},
+ {file = "numpy-2.2.6-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:f9f1adb22318e121c5c69a09142811a201ef17ab257a1e66ca3025065b7f53ae"},
+ {file = "numpy-2.2.6-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:c820a93b0255bc360f53eca31a0e676fd1101f673dda8da93454a12e23fc5f7a"},
+ {file = "numpy-2.2.6-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:3d70692235e759f260c3d837193090014aebdf026dfd167834bcba43e30c2a42"},
+ {file = "numpy-2.2.6-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:481b49095335f8eed42e39e8041327c05b0f6f4780488f61286ed3c01368d491"},
+ {file = "numpy-2.2.6-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b64d8d4d17135e00c8e346e0a738deb17e754230d7e0810ac5012750bbd85a5a"},
+ {file = "numpy-2.2.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba10f8411898fc418a521833e014a77d3ca01c15b0c6cdcce6a0d2897e6dbbdf"},
+ {file = "numpy-2.2.6-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:bd48227a919f1bafbdda0583705e547892342c26fb127219d60a5c36882609d1"},
+ {file = "numpy-2.2.6-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:9551a499bf125c1d4f9e250377c1ee2eddd02e01eac6644c080162c0c51778ab"},
+ {file = "numpy-2.2.6-cp311-cp311-win32.whl", hash = "sha256:0678000bb9ac1475cd454c6b8c799206af8107e310843532b04d49649c717a47"},
+ {file = "numpy-2.2.6-cp311-cp311-win_amd64.whl", hash = "sha256:e8213002e427c69c45a52bbd94163084025f533a55a59d6f9c5b820774ef3303"},
+ {file = "numpy-2.2.6-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:41c5a21f4a04fa86436124d388f6ed60a9343a6f767fced1a8a71c3fbca038ff"},
+ {file = "numpy-2.2.6-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:de749064336d37e340f640b05f24e9e3dd678c57318c7289d222a8a2f543e90c"},
+ {file = "numpy-2.2.6-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:894b3a42502226a1cac872f840030665f33326fc3dac8e57c607905773cdcde3"},
+ {file = "numpy-2.2.6-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:71594f7c51a18e728451bb50cc60a3ce4e6538822731b2933209a1f3614e9282"},
+ {file = "numpy-2.2.6-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f2618db89be1b4e05f7a1a847a9c1c0abd63e63a1607d892dd54668dd92faf87"},
+ {file = "numpy-2.2.6-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fd83c01228a688733f1ded5201c678f0c53ecc1006ffbc404db9f7a899ac6249"},
+ {file = "numpy-2.2.6-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:37c0ca431f82cd5fa716eca9506aefcabc247fb27ba69c5062a6d3ade8cf8f49"},
+ {file = "numpy-2.2.6-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:fe27749d33bb772c80dcd84ae7e8df2adc920ae8297400dabec45f0dedb3f6de"},
+ {file = "numpy-2.2.6-cp312-cp312-win32.whl", hash = "sha256:4eeaae00d789f66c7a25ac5f34b71a7035bb474e679f410e5e1a94deb24cf2d4"},
+ {file = "numpy-2.2.6-cp312-cp312-win_amd64.whl", hash = "sha256:c1f9540be57940698ed329904db803cf7a402f3fc200bfe599334c9bd84a40b2"},
+ {file = "numpy-2.2.6-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:0811bb762109d9708cca4d0b13c4f67146e3c3b7cf8d34018c722adb2d957c84"},
+ {file = "numpy-2.2.6-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:287cc3162b6f01463ccd86be154f284d0893d2b3ed7292439ea97eafa8170e0b"},
+ {file = "numpy-2.2.6-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:f1372f041402e37e5e633e586f62aa53de2eac8d98cbfb822806ce4bbefcb74d"},
+ {file = "numpy-2.2.6-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:55a4d33fa519660d69614a9fad433be87e5252f4b03850642f88993f7b2ca566"},
+ {file = "numpy-2.2.6-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f92729c95468a2f4f15e9bb94c432a9229d0d50de67304399627a943201baa2f"},
+ {file = "numpy-2.2.6-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1bc23a79bfabc5d056d106f9befb8d50c31ced2fbc70eedb8155aec74a45798f"},
+ {file = "numpy-2.2.6-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:e3143e4451880bed956e706a3220b4e5cf6172ef05fcc397f6f36a550b1dd868"},
+ {file = "numpy-2.2.6-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:b4f13750ce79751586ae2eb824ba7e1e8dba64784086c98cdbbcc6a42112ce0d"},
+ {file = "numpy-2.2.6-cp313-cp313-win32.whl", hash = "sha256:5beb72339d9d4fa36522fc63802f469b13cdbe4fdab4a288f0c441b74272ebfd"},
+ {file = "numpy-2.2.6-cp313-cp313-win_amd64.whl", hash = "sha256:b0544343a702fa80c95ad5d3d608ea3599dd54d4632df855e4c8d24eb6ecfa1c"},
+ {file = "numpy-2.2.6-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:0bca768cd85ae743b2affdc762d617eddf3bcf8724435498a1e80132d04879e6"},
+ {file = "numpy-2.2.6-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:fc0c5673685c508a142ca65209b4e79ed6740a4ed6b2267dbba90f34b0b3cfda"},
+ {file = "numpy-2.2.6-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:5bd4fc3ac8926b3819797a7c0e2631eb889b4118a9898c84f585a54d475b7e40"},
+ {file = "numpy-2.2.6-cp313-cp313t-macosx_14_0_x86_64.whl", hash = "sha256:fee4236c876c4e8369388054d02d0e9bb84821feb1a64dd59e137e6511a551f8"},
+ {file = "numpy-2.2.6-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e1dda9c7e08dc141e0247a5b8f49cf05984955246a327d4c48bda16821947b2f"},
+ {file = "numpy-2.2.6-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f447e6acb680fd307f40d3da4852208af94afdfab89cf850986c3ca00562f4fa"},
+ {file = "numpy-2.2.6-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:389d771b1623ec92636b0786bc4ae56abafad4a4c513d36a55dce14bd9ce8571"},
+ {file = "numpy-2.2.6-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:8e9ace4a37db23421249ed236fdcdd457d671e25146786dfc96835cd951aa7c1"},
+ {file = "numpy-2.2.6-cp313-cp313t-win32.whl", hash = "sha256:038613e9fb8c72b0a41f025a7e4c3f0b7a1b5d768ece4796b674c8f3fe13efff"},
+ {file = "numpy-2.2.6-cp313-cp313t-win_amd64.whl", hash = "sha256:6031dd6dfecc0cf9f668681a37648373bddd6421fff6c66ec1624eed0180ee06"},
+ {file = "numpy-2.2.6-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:0b605b275d7bd0c640cad4e5d30fa701a8d59302e127e5f79138ad62762c3e3d"},
+ {file = "numpy-2.2.6-pp310-pypy310_pp73-macosx_14_0_x86_64.whl", hash = "sha256:7befc596a7dc9da8a337f79802ee8adb30a552a94f792b9c9d18c840055907db"},
+ {file = "numpy-2.2.6-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ce47521a4754c8f4593837384bd3424880629f718d87c5d44f8ed763edd63543"},
+ {file = "numpy-2.2.6-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:d042d24c90c41b54fd506da306759e06e568864df8ec17ccc17e9e884634fd00"},
+ {file = "numpy-2.2.6.tar.gz", hash = "sha256:e29554e2bef54a90aa5cc07da6ce955accb83f21ab5de01a62c8478897b264fd"},
]
[[package]]
@@ -5158,6 +5139,37 @@ datalib = ["numpy (>=1)", "pandas (>=1.2.3)", "pandas-stubs (>=1.1.0.11)"]
realtime = ["websockets (>=13,<16)"]
voice-helpers = ["numpy (>=2.0.2)", "sounddevice (>=0.5.1)"]
+[[package]]
+name = "opencv-python"
+version = "4.12.0.88"
+description = "Wrapper package for OpenCV python bindings."
+optional = false
+python-versions = ">=3.6"
+groups = ["main"]
+files = [
+ {file = "opencv-python-4.12.0.88.tar.gz", hash = "sha256:8b738389cede219405f6f3880b851efa3415ccd674752219377353f017d2994d"},
+ {file = "opencv_python-4.12.0.88-cp37-abi3-macosx_13_0_arm64.whl", hash = "sha256:f9a1f08883257b95a5764bf517a32d75aec325319c8ed0f89739a57fae9e92a5"},
+ {file = "opencv_python-4.12.0.88-cp37-abi3-macosx_13_0_x86_64.whl", hash = "sha256:812eb116ad2b4de43ee116fcd8991c3a687f099ada0b04e68f64899c09448e81"},
+ {file = "opencv_python-4.12.0.88-cp37-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:51fd981c7df6af3e8f70b1556696b05224c4e6b6777bdd2a46b3d4fb09de1a92"},
+ {file = "opencv_python-4.12.0.88-cp37-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:092c16da4c5a163a818f120c22c5e4a2f96e0db4f24e659c701f1fe629a690f9"},
+ {file = "opencv_python-4.12.0.88-cp37-abi3-win32.whl", hash = "sha256:ff554d3f725b39878ac6a2e1fa232ec509c36130927afc18a1719ebf4fbf4357"},
+ {file = "opencv_python-4.12.0.88-cp37-abi3-win_amd64.whl", hash = "sha256:d98edb20aa932fd8ebd276a72627dad9dc097695b3d435a4257557bbb49a79d2"},
+]
+
+[package.dependencies]
+numpy = {version = ">=2,<2.3.0", markers = "python_version >= \"3.9\""}
+
+[[package]]
+name = "opendataloader-pdf"
+version = "1.3.0"
+description = "A Python wrapper for the opendataloader-pdf Java CLI."
+optional = false
+python-versions = "<4.0,>=3.9"
+groups = ["main"]
+files = [
+ {file = "opendataloader_pdf-1.3.0-py3-none-any.whl", hash = "sha256:a6b80d8a6d11e21f0712c3d5f2b4fef8a62c3d57eff8e9c5c7b3deefcb411a07"},
+]
+
[[package]]
name = "opensearch-dsl"
version = "2.1.0"
@@ -5739,6 +5751,21 @@ pygments = "*"
[package.extras]
testing = ["ipython", "pexpect", "pytest", "pytest-cov"]
+[[package]]
+name = "pdf2image"
+version = "1.17.0"
+description = "A wrapper around the pdftoppm and pdftocairo command line tools to convert PDF to a PIL Image list."
+optional = false
+python-versions = "*"
+groups = ["main"]
+files = [
+ {file = "pdf2image-1.17.0-py3-none-any.whl", hash = "sha256:ecdd58d7afb810dffe21ef2b1bbc057ef434dabbac6c33778a38a3f7744a27e2"},
+ {file = "pdf2image-1.17.0.tar.gz", hash = "sha256:eaa959bc116b420dd7ec415fcae49b98100dda3dd18cd2fdfa86d09f112f6d57"},
+]
+
+[package.dependencies]
+pillow = "*"
+
[[package]]
name = "pexpect"
version = "4.9.0"
@@ -6606,6 +6633,23 @@ dev = ["coverage[toml] (==5.0.4)", "cryptography (>=3.4.0)", "pre-commit", "pyte
docs = ["sphinx", "sphinx-rtd-theme", "zope.interface"]
tests = ["coverage[toml] (==5.0.4)", "pytest (>=6.0.0,<7.0.0)"]
+[[package]]
+name = "pymupdf"
+version = "1.26.6"
+description = "A high performance Python library for data extraction, analysis, conversion & manipulation of PDF (and other) documents."
+optional = false
+python-versions = ">=3.10"
+groups = ["main"]
+files = [
+ {file = "pymupdf-1.26.6-cp310-abi3-macosx_10_9_x86_64.whl", hash = "sha256:e46f320a136ad55e5219e8f0f4061bdf3e4c12b126d2740d5a49f73fae7ea176"},
+ {file = "pymupdf-1.26.6-cp310-abi3-macosx_11_0_arm64.whl", hash = "sha256:6844cd2396553c0fa06de4869d5d5ecb1260e6fc3b9d85abe8fa35f14dd9d688"},
+ {file = "pymupdf-1.26.6-cp310-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:617ba69e02c44f0da1c0e039ea4a26cf630849fd570e169c71daeb8ac52a81d6"},
+ {file = "pymupdf-1.26.6-cp310-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:7777d0b7124c2ebc94849536b6a1fb85d158df3b9d873935e63036559391534c"},
+ {file = "pymupdf-1.26.6-cp310-abi3-win32.whl", hash = "sha256:8f3ef05befc90ca6bb0f12983200a7048d5bff3e1c1edef1bb3de60b32cb5274"},
+ {file = "pymupdf-1.26.6-cp310-abi3-win_amd64.whl", hash = "sha256:ce02ca96ed0d1acfd00331a4d41a34c98584d034155b06fd4ec0f051718de7ba"},
+ {file = "pymupdf-1.26.6.tar.gz", hash = "sha256:a2b4531cd4ab36d6f1f794bb6d3c33b49bda22f36d58bb1f3e81cbc10183bd2b"},
+]
+
[[package]]
name = "pynacl"
version = "1.5.0"
@@ -9392,4 +9436,4 @@ cffi = ["cffi (>=1.11)"]
[metadata]
lock-version = "2.1"
python-versions = "~3.12"
-content-hash = "4eabef5281cd51058e626275a93902503c902070bc6c2e4d26616f44e814fe3d"
+content-hash = "d9d5fa4f85de593da403f14418d80173a0e81a0f53336065eaf744c23c3ee1c6"
diff --git a/pyproject.toml b/pyproject.toml
index a685c117ce..b0184716bb 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -116,6 +116,10 @@ youtube-transcript-api = "^1.0.0"
pypdfium2 = "^4.30.0"
pyarrow = "^21.0.0"
django-zeal = "^2.0.4"
+pdf2image = "^1.17.0"
+opendataloader-pdf = "^1.3.0"
+pymupdf = "^1.26.6"
+opencv-python = "^4.12.0.88"
diff --git a/test_pdfs/notes.pdf b/test_pdfs/notes.pdf
new file mode 100644
index 0000000000..c5f99c95a8
Binary files /dev/null and b/test_pdfs/notes.pdf differ