This repository was archived by the owner on May 17, 2024. It is now read-only.

Commit 01267fd

Merge branch 'master' into test_unique_keys
2 parents: 3e82588 + 1a86c58

21 files changed (+736, -758 lines)

CONTRIBUTING.md

Lines changed: 83 additions & 0 deletions
@@ -79,3 +79,86 @@ New databases should be added as a new module in the `data-diff/databases/` fold
If possible, please also add the database setup to `docker-compose.yml`, so that we can run and test it for ourselves. If you do, also update the CI (`ci.yml`).

Guide to implementing a new database driver: https://data-diff.readthedocs.io/en/latest/new-database-driver-guide.html

## Development Setup

The development setup centers around using `docker-compose` to boot up various databases, and then inserting data into them.

On macOS, for better Docker performance, we suggest enabling the following in the Docker UI:

* Use new Virtualization Framework
* Enable VirtioFS accelerated directory sharing

**1. Install Data Diff**

When developing/debugging, it's recommended to install dependencies and run it directly with `poetry` rather than going through the installed package.

```
$ brew install mysql postgresql # macOS dependencies for C bindings
$ apt-get install libpq-dev libmysqlclient-dev # Debian dependencies
$ pip install poetry # Python dependency isolation tool
$ poetry install # Install dependencies
```
**2. Start Databases**

[Install **docker-compose**][docker-compose] if you haven't already.

```shell-session
$ docker-compose up -d mysql postgres # run mysql and postgres dbs in background
```

[docker-compose]: https://docs.docker.com/compose/install/

**3. Run Unit Tests**

There are more than 1000 tests for all the different type and database combinations, so we recommend using `unittest-parallel`, which is installed as a development dependency.

```shell-session
$ poetry run unittest-parallel -j 16 # run all tests
$ poetry run python -m unittest -k <test> # run individual test
```

**4. Seed the Database(s) (optional)**

First, download a CSV of seed data:

```shell-session
$ curl https://datafold-public.s3.us-west-2.amazonaws.com/1m.csv -o dev/ratings.csv
# For a larger data set (but takes 25x longer to import):
# - curl https://datafold-public.s3.us-west-2.amazonaws.com/25m.csv -o dev/ratings.csv
```

Now you can insert it into the testing database(s):

```shell-session
# Seeding more than one database is optional; it lets you run data-diff(1) between them.
$ poetry run preql -f dev/prepare_db.pql mysql://mysql:Password1@127.0.0.1:3306/mysql
$ poetry run preql -f dev/prepare_db.pql postgresql://postgres:Password1@127.0.0.1:5432/postgres
# Cloud databases
$ poetry run preql -f dev/prepare_db.pql snowflake://<uri>
$ poetry run preql -f dev/prepare_db.pql mssql://<uri>
$ poetry run preql -f dev/prepare_db.pql bigquery:///<project>
```

**5. Run data-diff against a seeded database (optional)**

```bash
poetry run python3 -m data_diff postgresql://postgres:Password1@localhost/postgres rating postgresql://postgres:Password1@localhost/postgres rating_del1 --verbose
```

**6. Run benchmarks (optional)**

```shell-session
$ dev/benchmark.sh # runs benchmarks and puts results in benchmark_<sha>.csv
$ poetry run python3 dev/graph.py # create graphs from benchmark_*.csv files
```

You can adjust how many rows are benchmarked by passing `N_SAMPLES` to `dev/benchmark.sh`:

```shell-session
$ N_SAMPLES=100000000 dev/benchmark.sh # 100m rows, which is our canonical target
```

README.md

Lines changed: 76 additions & 569 deletions
Large diffs are not rendered by default.

data_diff/__main__.py

Lines changed: 2 additions & 2 deletions
@@ -65,8 +65,8 @@ def __init__(self, **kwargs):
         self.indent_increment = 6
 
     def write_usage(self, prog: str, args: str = "", prefix: Optional[str] = None) -> None:
-        self.write(f"data-diff - efficiently diff rows across database tables.\n\n")
-        self.write(f"Usage:\n")
+        self.write("data-diff - efficiently diff rows across database tables.\n\n")
+        self.write("Usage:\n")
         self.write(f" * In-db diff: {prog} <database1> <table1> <table2> [OPTIONS]\n")
         self.write(f" * Cross-db diff: {prog} <database1> <table1> <database2> <table2> [OPTIONS]\n")
         self.write(f" * Using config: {prog} --conf PATH [--run NAME] [OPTIONS]\n")

data_diff/databases/base.py

Lines changed: 5 additions & 4 deletions
@@ -145,7 +145,7 @@ def query(self, sql_ast: Union[Expr, Generator], res_type: type = list):
            (row,) = row
            logger.debug("EXPLAIN: %s", row)
            answer = input("Continue? [y/n] ")
-           if not answer.lower() in ["y", "yes"]:
+           if answer.lower() not in ["y", "yes"]:
                sys.exit(1)
 
        res = self._query(sql_code)
@@ -327,9 +327,9 @@ def offset_limit(self, offset: Optional[int] = None, limit: Optional[int] = None
 
        return f"LIMIT {limit}"
 
-    def concat(self, l: List[str]) -> str:
-        assert len(l) > 1
-        joined_exprs = ", ".join(l)
+    def concat(self, items: List[str]) -> str:
+        assert len(items) > 1
+        joined_exprs = ", ".join(items)
        return f"concat({joined_exprs})"
 
    def is_distinct_from(self, a: str, b: str) -> str:
@@ -352,6 +352,7 @@ def _constant_value(self, v):
        elif isinstance(v, str):
            return f"'{v}'"
        elif isinstance(v, datetime):
+            # TODO use self.timestamp_value
            return f"timestamp '{v}'"
        elif isinstance(v, UUID):
            return f"'{v}'"

data_diff/databases/connect.py

Lines changed: 8 additions & 0 deletions
@@ -184,6 +184,8 @@ def connect(db_conf: Union[str, dict], thread_count: Optional[int] = 1) -> Datab
 
    Configuration can be given either as a URI string, or as a dict of {option: value}.
 
+    The dictionary configuration uses the same keys as the TOML 'database' definition given with --conf.
+
    thread_count determines the max number of worker threads per database,
    if relevant. None means no limit.
 
@@ -205,6 +207,12 @@ def connect(db_conf: Union[str, dict], thread_count: Optional[int] = 1) -> Datab
    - trino
    - clickhouse
    - vertica
+
+    Example:
+        >>> connect("mysql://localhost/db")
+        <data_diff.databases.mysql.MySQL object at 0x0000025DB45F4190>
+        >>> connect({"driver": "mysql", "host": "localhost", "database": "db"})
+        <data_diff.databases.mysql.MySQL object at 0x0000025DB3F94820>
    """
    if isinstance(db_conf, str):
        return connect_to_uri(db_conf, thread_count)
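A hedged usage sketch of the two configuration forms documented above; the host, credentials, and the extra dictionary keys (`user`, `password`, `port`) are illustrative assumptions, not values taken from this commit:

```python
from data_diff.databases.connect import connect

# URI form (hypothetical host and credentials)
db1 = connect("mysql://user:password@localhost:3306/db")

# Dict form, using the same keys as the TOML 'database' definition;
# 'user', 'password', and 'port' are assumed keys for illustration.
db2 = connect(
    {"driver": "mysql", "host": "localhost", "port": 3306,
     "user": "user", "password": "password", "database": "db"},
    thread_count=4,  # max number of worker threads for this database, if relevant
)
```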

data_diff/databases/database_types.py

Lines changed: 6 additions & 5 deletions
@@ -147,12 +147,12 @@ class AbstractDialect(ABC):
 
    @abstractmethod
    def quote(self, s: str):
-        "Quote SQL name (implementation specific)"
+        "Quote SQL name"
        ...
 
    @abstractmethod
-    def concat(self, l: List[str]) -> str:
-        "Provide SQL for concatenating a bunch of column into a string"
+    def concat(self, items: List[str]) -> str:
+        "Provide SQL for concatenating a bunch of columns into a string"
        ...
 
    @abstractmethod
@@ -162,12 +162,13 @@ def is_distinct_from(self, a: str, b: str) -> str:
 
    @abstractmethod
    def to_string(self, s: str) -> str:
+        # TODO rewrite using cast_to(x, str)
        "Provide SQL for casting a column to string"
        ...
 
    @abstractmethod
    def random(self) -> str:
-        "Provide SQL for generating a random number"
+        "Provide SQL for generating a random number between 0..1"
 
    @abstractmethod
    def offset_limit(self, offset: Optional[int] = None, limit: Optional[int] = None):
@@ -176,7 +177,7 @@ def offset_limit(self, offset: Optional[int] = None, limit: Optional[int] = None
 
    @abstractmethod
    def explain_as_text(self, query: str) -> str:
-        "Provide SQL for explaining a query, returned in as table(varchar)"
+        "Provide SQL for explaining a query, returned as table(varchar)"
        ...
 
    @abstractmethod
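To make the abstract interface above concrete, here is a hypothetical minimal dialect sketch implementing the methods this hunk touches; the class name, its SQL choices, and the absence of the real base classes are assumptions for illustration, not part of this commit:

```python
from typing import List, Optional

class ExampleDialect:
    """Hypothetical dialect sketch; real drivers subclass data_diff's base classes."""

    def quote(self, s: str) -> str:
        # Quote a SQL name
        return f'"{s}"'

    def concat(self, items: List[str]) -> str:
        # SQL for concatenating a bunch of columns into a string
        return "(" + " || ".join(items) + ")"

    def to_string(self, s: str) -> str:
        # SQL for casting a column to string
        return f"CAST({s} AS VARCHAR)"

    def random(self) -> str:
        # SQL for generating a random number between 0..1
        return "RANDOM()"

    def offset_limit(self, offset: Optional[int] = None, limit: Optional[int] = None) -> str:
        if offset:
            raise NotImplementedError("offset is not supported in this sketch")
        return f"LIMIT {limit}"

    def explain_as_text(self, query: str) -> str:
        return f"EXPLAIN {query}"
```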

data_diff/databases/oracle.py

Lines changed: 5 additions & 3 deletions
@@ -128,8 +128,8 @@ def offset_limit(self, offset: Optional[int] = None, limit: Optional[int] = None
 
        return f"FETCH NEXT {limit} ROWS ONLY"
 
-    def concat(self, l: List[str]) -> str:
-        joined_exprs = " || ".join(l)
+    def concat(self, items: List[str]) -> str:
+        joined_exprs = " || ".join(items)
        return f"({joined_exprs})"
 
    def timestamp_value(self, t: DbTime) -> str:
@@ -154,4 +154,6 @@ def type_repr(self, t) -> str:
        return super().type_repr(t)
 
    def constant_values(self, rows) -> str:
-        return " UNION ALL ".join("SELECT %s FROM DUAL" % ", ".join(self._constant_value(v) for v in row) for row in rows)
+        return " UNION ALL ".join(
+            "SELECT %s FROM DUAL" % ", ".join(self._constant_value(v) for v in row) for row in rows
+        )
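The reformatted `constant_values()` above is behaviour-preserving; this standalone sketch mirrors it with a simplified stand-in for `self._constant_value()` to show the Oracle SQL it assembles:

```python
# Simplified stand-in for the dialect's _constant_value() (sketch only).
def _constant_value(v) -> str:
    return f"'{v}'" if isinstance(v, str) else str(v)

def constant_values(rows) -> str:
    # One SELECT ... FROM DUAL per row, combined with UNION ALL, matching the method above.
    return " UNION ALL ".join(
        "SELECT %s FROM DUAL" % ", ".join(_constant_value(v) for v in row) for row in rows
    )

print(constant_values([(1, "a"), (2, "b")]))
# -> SELECT 1, 'a' FROM DUAL UNION ALL SELECT 2, 'b' FROM DUAL
```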

data_diff/databases/redshift.py

Lines changed: 2 additions & 2 deletions
@@ -36,8 +36,8 @@ def normalize_timestamp(self, value: str, coltype: TemporalType) -> str:
    def normalize_number(self, value: str, coltype: FractionalType) -> str:
        return self.to_string(f"{value}::decimal(38,{coltype.precision})")
 
-    def concat(self, l: List[str]) -> str:
-        joined_exprs = " || ".join(l)
+    def concat(self, items: List[str]) -> str:
+        joined_exprs = " || ".join(items)
        return f"({joined_exprs})"
 
    def select_table_schema(self, path: DbPath) -> str:

data_diff/databases/vertica.py

Lines changed: 2 additions & 2 deletions
@@ -99,8 +99,8 @@ def select_table_schema(self, path: DbPath) -> str:
    def quote(self, s: str):
        return f'"{s}"'
 
-    def concat(self, l: List[str]) -> str:
-        return " || ".join(l)
+    def concat(self, items: List[str]) -> str:
+        return " || ".join(items)
 
    def md5_to_int(self, s: str) -> str:
        return f"CAST(HEX_TO_INTEGER(SUBSTRING(MD5({s}), {1 + MD5_HEXDIGITS - CHECKSUM_HEXDIGITS})) AS NUMERIC(38, 0))"

data_diff/joindiff_tables.py

Lines changed: 1 addition & 2 deletions
@@ -2,7 +2,6 @@
 
 """
 
-from contextlib import suppress
 from decimal import Decimal
 from functools import partial
 import logging
@@ -21,7 +20,7 @@
 from .diff_tables import TableDiffer, DiffResult
 from .thread_utils import ThreadedYielder
 
-from .queries import table, sum_, min_, max_, avg, commit
+from .queries import table, sum_, min_, max_, avg
 from .queries.api import and_, if_, or_, outerjoin, leftjoin, rightjoin, this, ITable
 from .queries.ast_classes import Concat, Count, Expr, Random, TablePath
 from .queries.compiler import Compiler
