Bugfix in TableSegment: Sampling now respects the 'where' clause (issue #221)

erezsh · erezsh · commit 57cb682a0629 · 2022-09-02T12:16:24.000+02:00
diff --git a/data_diff/databases/base.py b/data_diff/databases/base.py
@@ -187,25 +187,28 @@ def query_table_schema(self, path: DbPath) -> Dict[str, tuple]:
         assert len(d) == len(rows)
         return d
 
-    def _process_table_schema(self, path: DbPath, raw_schema: Dict[str, tuple], filter_columns: Sequence[str]):
+    def _process_table_schema(self, path: DbPath, raw_schema: Dict[str, tuple], filter_columns: Sequence[str], where: str = None):
         accept = {i.lower() for i in filter_columns}
 
         col_dict = {row[0]: self._parse_type(path, *row) for name, row in raw_schema.items() if name.lower() in accept}
 
-        self._refine_coltypes(path, col_dict)
+        self._refine_coltypes(path, col_dict, where)
 
         # Return a dict of form {name: type} after normalization
         return col_dict
 
-    def _refine_coltypes(self, table_path: DbPath, col_dict: Dict[str, ColType]):
-        "Refine the types in the column dict, by querying the database for a sample of their values"
+    def _refine_coltypes(self, table_path: DbPath, col_dict: Dict[str, ColType], where: str = None):
+        """Refine the types in the column dict, by querying the database for a sample of their values
+
+        'where' restricts the rows to be sampled.
+        """
 
         text_columns = [k for k, v in col_dict.items() if isinstance(v, Text)]
         if not text_columns:
             return
 
         fields = [self.normalize_uuid(c, String_UUID()) for c in text_columns]
-        samples_by_row = self.query(Select(fields, TableName(table_path), limit=16), list)
+        samples_by_row = self.query(Select(fields, TableName(table_path), limit=16, where=where and [where]), list)
         if not samples_by_row:
             raise ValueError(f"Table {table_path} is empty.")
 
diff --git a/data_diff/databases/database_types.py b/data_diff/databases/database_types.py
@@ -177,7 +177,7 @@ def query_table_schema(self, path: DbPath) -> Dict[str, tuple]:
         ...
 
     @abstractmethod
-    def _process_table_schema(self, path: DbPath, raw_schema: Dict[str, tuple], filter_columns: Sequence[str]):
+    def _process_table_schema(self, path: DbPath, raw_schema: Dict[str, tuple], filter_columns: Sequence[str], where: str = None):
         """Process the result of query_table_schema().
 
         Done in a separate step, to minimize the amount of processed columns.
diff --git a/data_diff/databases/databricks.py b/data_diff/databases/databricks.py
@@ -83,7 +83,7 @@ def query_table_schema(self, path: DbPath) -> Dict[str, tuple]:
             assert len(d) == len(rows)
             return d
 
-    def _process_table_schema(self, path: DbPath, raw_schema: Dict[str, tuple], filter_columns: Sequence[str]):
+    def _process_table_schema(self, path: DbPath, raw_schema: Dict[str, tuple], filter_columns: Sequence[str], where: str = None):
         accept = {i.lower() for i in filter_columns}
         rows = [row for name, row in raw_schema.items() if name.lower() in accept]
 
@@ -115,7 +115,7 @@ def _process_table_schema(self, path: DbPath, raw_schema: Dict[str, tuple], filt
 
         col_dict: Dict[str, ColType] = {row[0]: self._parse_type(path, *row) for row in resulted_rows}
 
-        self._refine_coltypes(path, col_dict)
+        self._refine_coltypes(path, col_dict, where)
         return col_dict
 
     def normalize_timestamp(self, value: str, coltype: TemporalType) -> str:
diff --git a/data_diff/table_segment.py b/data_diff/table_segment.py
@@ -111,7 +111,7 @@ def _normalize_column(self, name: str, template: str = None) -> str:
         return self.database.normalize_value_by_type(col, col_type)
 
     def _with_raw_schema(self, raw_schema: dict) -> "TableSegment":
-        schema = self.database._process_table_schema(self.table_path, raw_schema, self._relevant_columns)
+        schema = self.database._process_table_schema(self.table_path, raw_schema, self._relevant_columns, self.where)
         return self.new(_schema=create_schema(self.database, self.table_path, schema, self.case_sensitive))
 
     def with_schema(self) -> "TableSegment":
diff --git a/tests/test_diff_tables.py b/tests/test_diff_tables.py
@@ -443,6 +443,16 @@ def test_string_keys(self):
 
         self.assertRaises(ValueError, list, differ.diff_tables(self.a, self.b))
 
+    def test_where_sampling(self):
+        a = self.a.replace(where='1=1')
+
+        differ = TableDiffer()
+        diff = list(differ.diff_tables(a, self.b))
+        self.assertEqual(diff, [("-", (str(self.new_uuid), "This one is different"))])
+
+        a_empty = self.a.replace(where='1=0')
+        self.assertRaises(ValueError, list, differ.diff_tables(a_empty, self.b))
+
 
 @test_per_database
 class TestAlphanumericKeys(TestPerDatabase):