datafold
diff --git a/data_diff/__init__.py b/data_diff/__init__.py
Lines changed: 2 additions & 9 deletions
diff --git a/data_diff/__main__.py b/data_diff/__main__.py
Lines changed: 6 additions & 11 deletions
diff --git a/data_diff/diff_tables.py b/data_diff/diff_tables.py
Lines changed: 7 additions & 257 deletions
@@ -2,15 +2,8 @@
 
 from .tracking import disable_tracking
 from .databases.connect import connect
-from .diff_tables import (
-    TableSegment,
-    TableDiffer,
-    DEFAULT_BISECTION_THRESHOLD,
-    DEFAULT_BISECTION_FACTOR,
-    DbKey,
-    DbTime,
-    DbPath,
-)
+from .databases.database_types import DbKey, DbTime, DbPath
+from .diff_tables import TableSegment, TableDiffer, DEFAULT_BISECTION_THRESHOLD, DEFAULT_BISECTION_FACTOR
 
 
 def connect_to_table(
 
@@ -5,23 +5,18 @@
 import logging
 from itertools import islice
 
-from data_diff.tracking import disable_tracking
+import rich
+import click
 
-from .utils import remove_password_from_url, safezip, match_like
 
-from .diff_tables import (
-    TableSegment,
-    TableDiffer,
-    DEFAULT_BISECTION_THRESHOLD,
-    DEFAULT_BISECTION_FACTOR,
-    create_schema,
-)
+from .utils import remove_password_from_url, safezip, match_like
+from .diff_tables import TableDiffer, DEFAULT_BISECTION_THRESHOLD, DEFAULT_BISECTION_FACTOR
+from .table_segment import create_schema, TableSegment
 from .databases.connect import connect
 from .parse_time import parse_time_before_now, UNITS_STR, ParseError
 from .config import apply_config_from_file
+from .tracking import disable_tracking
 
-import rich
-import click
 
 LOG_FORMAT = "[%(asctime)s] %(levelname)s - %(message)s"
 DATE_FORMAT = "%H:%M:%S"
 
@@ -6,275 +6,24 @@
 from numbers import Number
 from operator import attrgetter, methodcaller
 from collections import defaultdict
-from typing import List, Tuple, Iterator, Optional
+from typing import Tuple, Iterator, Optional
 import logging
 from concurrent.futures import ThreadPoolExecutor, as_completed
 
 from runtype import dataclass
 
+from .utils import safezip, run_as_daemon
+from .databases.database_types import IKey, NumericType, PrecisionType, StringType
+from .table_segment import TableSegment
 from .tracking import create_end_event_json, create_start_event_json, send_event_json, is_tracking_enabled
-from .sql import Select, Checksum, Compare, Count, TableName, Time, Value
-from .utils import (
-    CaseAwareMapping,
-    CaseInsensitiveDict,
-    safezip,
-    split_space,
-    CaseSensitiveDict,
-    ArithString,
-    run_as_daemon,
-)
-from .databases.base import Database
-from .databases.database_types import (
-    DbPath,
-    DbKey,
-    DbTime,
-    IKey,
-    Native_UUID,
-    NumericType,
-    PrecisionType,
-    StringType,
-    Schema,
-)
 
 logger = logging.getLogger("diff_tables")
 
-RECOMMENDED_CHECKSUM_DURATION = 10
 BENCHMARK = os.environ.get("BENCHMARK", False)
 DEFAULT_BISECTION_THRESHOLD = 1024 * 16
 DEFAULT_BISECTION_FACTOR = 32
 
 
-def create_schema(db: Database, table_path: DbPath, schema: dict, case_sensitive: bool) -> CaseAwareMapping:
-    logger.debug(f"[{db.name}] Schema = {schema}")
-
-    if case_sensitive:
-        return CaseSensitiveDict(schema)
-
-    if len({k.lower() for k in schema}) < len(schema):
-        logger.warning(f'Ambiguous schema for {db}:{".".join(table_path)} | Columns = {", ".join(list(schema))}')
-        logger.warning("We recommend to disable case-insensitivity (remove --any-case).")
-    return CaseInsensitiveDict(schema)
-
-
-@dataclass(frozen=False)
-class TableSegment:
-    """Signifies a segment of rows (and selected columns) within a table
-
-    Parameters:
-        database (Database): Database instance. See :meth:`connect`
-        table_path (:data:`DbPath`): Path to table in form of a tuple. e.g. `('my_dataset', 'table_name')`
-        key_column (str): Name of the key column, which uniquely identifies each row (usually id)
-        update_column (str, optional): Name of updated column, which signals that rows changed (usually updated_at or last_update)
-        extra_columns (Tuple[str, ...], optional): Extra columns to compare
-        min_key (:data:`DbKey`, optional): Lowest key_column value, used to restrict the segment
-        max_key (:data:`DbKey`, optional): Highest key_column value, used to restrict the segment
-        min_update (:data:`DbTime`, optional): Lowest update_column value, used to restrict the segment
-        max_update (:data:`DbTime`, optional): Highest update_column value, used to restrict the segment
-        where (str, optional): An additional 'where' expression to restrict the search space.
-
-        case_sensitive (bool): If false, the case of column names will adjust according to the schema. Default is true.
-
-    """
-
-    # Location of table
-    database: Database
-    table_path: DbPath
-
-    # Columns
-    key_column: str
-    update_column: str = None
-    extra_columns: Tuple[str, ...] = ()
-
-    # Restrict the segment
-    min_key: DbKey = None
-    max_key: DbKey = None
-    min_update: DbTime = None
-    max_update: DbTime = None
-
-    where: str = None
-    case_sensitive: bool = True
-    _schema: Schema = None
-
-    def __post_init__(self):
-        if not self.update_column and (self.min_update or self.max_update):
-            raise ValueError("Error: the min_update/max_update feature requires 'update_column' to be set.")
-
-        if self.min_key is not None and self.max_key is not None and self.min_key >= self.max_key:
-            raise ValueError(f"Error: min_key expected to be smaller than max_key! ({self.min_key} >= {self.max_key})")
-
-        if self.min_update is not None and self.max_update is not None and self.min_update >= self.max_update:
-            raise ValueError(
-                f"Error: min_update expected to be smaller than max_update! ({self.min_update} >= {self.max_update})"
-            )
-
-    @property
-    def _update_column(self):
-        return self._quote_column(self.update_column)
-
-    def _quote_column(self, c: str) -> str:
-        if self._schema:
-            c = self._schema.get_key(c)  # Get the actual name. Might be case-insensitive.
-        return self.database.quote(c)
-
-    def _normalize_column(self, name: str, template: str = None) -> str:
-        if not self._schema:
-            raise RuntimeError(
-                "Cannot compile query when the schema is unknown. Please use TableSegment.with_schema()."
-            )
-
-        col_type = self._schema[name]
-        col = self._quote_column(name)
-
-        if isinstance(col_type, Native_UUID):
-            # Normalize first, apply template after (for uuids)
-            # Needed because min/max(uuid) fails in postgresql
-            col = self.database.normalize_value_by_type(col, col_type)
-            if template is not None:
-                col = template % col  # Apply template using Python's string formatting
-            return col
-
-        # Apply template before normalizing (for ints)
-        if template is not None:
-            col = template % col  # Apply template using Python's string formatting
-
-        return self.database.normalize_value_by_type(col, col_type)
-
-    def _with_raw_schema(self, raw_schema: dict) -> "TableSegment":
-        schema = self.database._process_table_schema(self.table_path, raw_schema, self._relevant_columns)
-        return self.new(_schema=create_schema(self.database, self.table_path, schema, self.case_sensitive))
-
-    def with_schema(self) -> "TableSegment":
-        "Queries the table schema from the database, and returns a new instance of TableSegment, with a schema."
-        if self._schema:
-            return self
-
-        return self._with_raw_schema(self.database.query_table_schema(self.table_path))
-
-    def _make_key_range(self):
-        if self.min_key is not None:
-            yield Compare("<=", Value(self.min_key), self._quote_column(self.key_column))
-        if self.max_key is not None:
-            yield Compare("<", self._quote_column(self.key_column), Value(self.max_key))
-
-    def _make_update_range(self):
-        if self.min_update is not None:
-            yield Compare("<=", Time(self.min_update), self._update_column)
-        if self.max_update is not None:
-            yield Compare("<", self._update_column, Time(self.max_update))
-
-    def _make_select(self, *, table=None, columns=None, where=None, group_by=None, order_by=None):
-        if columns is None:
-            columns = [self._normalize_column(self.key_column)]
-        where = [
-            *self._make_key_range(),
-            *self._make_update_range(),
-            *([] if where is None else [where]),
-            *([] if self.where is None else [self.where]),
-        ]
-        order_by = None if order_by is None else [order_by]
-        return Select(
-            table=table or TableName(self.table_path),
-            where=where,
-            columns=columns,
-            group_by=group_by,
-            order_by=order_by,
-        )
-
-    def get_values(self) -> list:
-        "Download all the relevant values of the segment from the database"
-        select = self._make_select(columns=self._relevant_columns_repr)
-        return self.database.query(select, List[Tuple])
-
-    def choose_checkpoints(self, count: int) -> List[DbKey]:
-        "Suggests a bunch of evenly-spaced checkpoints to split by (not including start, end)"
-        assert self.is_bounded
-        if isinstance(self.min_key, ArithString):
-            assert type(self.min_key) is type(self.max_key)
-            checkpoints = split_space(self.min_key.int, self.max_key.int, count)
-            return [self.min_key.new(int=i) for i in checkpoints]
-
-        return split_space(self.min_key, self.max_key, count)
-
-    def segment_by_checkpoints(self, checkpoints: List[DbKey]) -> List["TableSegment"]:
-        "Split the current TableSegment to a bunch of smaller ones, separated by the given checkpoints"
-
-        if self.min_key and self.max_key:
-            assert all(self.min_key <= c < self.max_key for c in checkpoints)
-        checkpoints.sort()
-
-        # Calculate sub-segments
-        positions = [self.min_key] + checkpoints + [self.max_key]
-        ranges = list(zip(positions[:-1], positions[1:]))
-
-        # Create table segments
-        tables = [self.new(min_key=s, max_key=e) for s, e in ranges]
-
-        return tables
-
-    def new(self, **kwargs) -> "TableSegment":
-        """Using new() creates a copy of the instance using 'replace()'"""
-        return self.replace(**kwargs)
-
-    @property
-    def _relevant_columns(self) -> List[str]:
-        extras = list(self.extra_columns)
-
-        if self.update_column and self.update_column not in extras:
-            extras = [self.update_column] + extras
-
-        return [self.key_column] + extras
-
-    @property
-    def _relevant_columns_repr(self) -> List[str]:
-        return [self._normalize_column(c) for c in self._relevant_columns]
-
-    def count(self) -> Tuple[int, int]:
-        """Count how many rows are in the segment, in one pass."""
-        return self.database.query(self._make_select(columns=[Count()]), int)
-
-    def count_and_checksum(self) -> Tuple[int, int]:
-        """Count and checksum the rows in the segment, in one pass."""
-        start = time.monotonic()
-        count, checksum = self.database.query(
-            self._make_select(columns=[Count(), Checksum(self._relevant_columns_repr)]), tuple
-        )
-        duration = time.monotonic() - start
-        if duration > RECOMMENDED_CHECKSUM_DURATION:
-            logger.warning(
-                f"Checksum is taking longer than expected ({duration:.2f}s). "
-                "We recommend increasing --bisection-factor or decreasing --threads."
-            )
-
-        if count:
-            assert checksum, (count, checksum)
-        return count or 0, checksum if checksum is None else int(checksum)
-
-    def query_key_range(self) -> Tuple[int, int]:
-        """Query database for minimum and maximum key. This is used for setting the initial bounds."""
-        # Normalizes the result (needed for UUIDs) after the min/max computation
-        select = self._make_select(
-            columns=[
-                self._normalize_column(self.key_column, "min(%s)"),
-                self._normalize_column(self.key_column, "max(%s)"),
-            ]
-        )
-        min_key, max_key = self.database.query(select, tuple)
-
-        if min_key is None or max_key is None:
-            raise ValueError("Table appears to be empty")
-
-        return min_key, max_key
-
-    @property
-    def is_bounded(self):
-        return self.min_key is not None and self.max_key is not None
-
-    def approximate_size(self):
-        if not self.is_bounded:
-            raise RuntimeError("Cannot approximate the size of an unbounded segment. Must have min_key and max_key.")
-        return self.max_key - self.min_key
-
-
 def diff_sets(a: set, b: set) -> Iterator:
     s1 = set(a)
     s2 = set(b)
@@ -346,6 +95,7 @@ def diff_tables(self, table1: TableSegment, table2: TableSegment) -> DiffResult:
 
         self.stats["diff_count"] = 0
         start = time.monotonic()
+        error = None
         try:
 
             # Query and validate schema
@@ -388,7 +138,6 @@ def diff_tables(self, table1: TableSegment, table2: TableSegment) -> DiffResult:
                 post_tables = [t.new(min_key=max_key1, max_key=max_key2) for t in (table1, table2)]
                 yield from self._bisect_and_diff_tables(*post_tables)
 
-            error = None
         except BaseException as e:  # Catch KeyboardInterrupt too
             error = e
         finally:
@@ -559,7 +308,8 @@ def _threaded_call(self, func, iterable):
 
     def _thread_as_completed(self, func, iterable):
         if not self.threaded:
-            return map(func, iterable)
+            yield from map(func, iterable)
+            return
 
         with ThreadPoolExecutor(max_workers=self.max_threadpool_size) as task_pool:
             futures = [task_pool.submit(func, item) for item in iterable]