This repository was archived by the owner on May 17, 2024. It is now read-only.

Commit 01267fd

Merge branch 'master' into test_unique_keys
2 parents: 3e82588 + 1a86c58

21 files changed (+736, -758 lines)

CONTRIBUTING.md

Lines changed: 83 additions & 0 deletions
@@ -79,3 +79,86 @@ New databases should be added as a new module in the `data-diff/databases/` fold
If possible, please also add the database setup to `docker-compose.yml`, so that we can run and test it for ourselves. If you do, also update the CI (`ci.yml`).

Guide to implementing a new database driver: https://data-diff.readthedocs.io/en/latest/new-database-driver-guide.html

## Development Setup

The development setup centers around using `docker-compose` to boot up various databases, and then inserting data into them.

On macOS, for better Docker performance, we suggest enabling the following in the Docker UI:

* Use new Virtualization Framework
* Enable VirtioFS accelerated directory sharing

**1. Install Data Diff**

When developing/debugging, it's recommended to install dependencies and run it directly with `poetry` rather than going through the installed package.

```
$ brew install mysql postgresql # macOS dependencies for C bindings
$ apt-get install libpq-dev libmysqlclient-dev # Debian dependencies
$ pip install poetry # Python dependency isolation tool
$ poetry install # Install dependencies
```
**2. Start Databases**

[Install **docker-compose**][docker-compose] if you haven't already.

```shell-session
$ docker-compose up -d mysql postgres # run mysql and postgres dbs in background
```

[docker-compose]: https://docs.docker.com/compose/install/

**3. Run Unit Tests**

There are more than 1000 tests for all the different type and database combinations, so we recommend using `unittest-parallel`, which is installed as a development dependency.

```shell-session
$ poetry run unittest-parallel -j 16 # run all tests
$ poetry run python -m unittest -k <test> # run individual test
```

**4. Seed the Database(s) (optional)**

First, download a CSV of seed data:

```shell-session
$ curl https://datafold-public.s3.us-west-2.amazonaws.com/1m.csv -o dev/ratings.csv
# For a larger data set (but takes 25x longer to import):
# - curl https://datafold-public.s3.us-west-2.amazonaws.com/25m.csv -o dev/ratings.csv
```

Now you can insert it into the testing database(s):

```shell-session
# Seeding more than one database is optional; it lets you run data-diff(1) between them.
$ poetry run preql -f dev/prepare_db.pql mysql://mysql:Password1@127.0.0.1:3306/mysql
$ poetry run preql -f dev/prepare_db.pql postgresql://postgres:Password1@127.0.0.1:5432/postgres
# Cloud databases
$ poetry run preql -f dev/prepare_db.pql snowflake://<uri>
$ poetry run preql -f dev/prepare_db.pql mssql://<uri>
$ poetry run preql -f dev/prepare_db.pql bigquery:///<project>
```

**5. Run data-diff against a seeded database (optional)**

```bash
poetry run python3 -m data_diff postgresql://postgres:Password1@localhost/postgres rating postgresql://postgres:Password1@localhost/postgres rating_del1 --verbose
```

**6. Run benchmarks (optional)**

```shell-session
$ dev/benchmark.sh # runs benchmarks and puts results in benchmark_<sha>.csv
$ poetry run python3 dev/graph.py # create graphs from benchmark_*.csv files
```

You can adjust how many rows are benchmarked by passing `N_SAMPLES` to `dev/benchmark.sh`:

```shell-session
$ N_SAMPLES=100000000 dev/benchmark.sh # 100m rows, which is our canonical target
```

README.md

Lines changed: 76 additions & 569 deletions
Large diffs are not rendered by default.

data_diff/__main__.py

Lines changed: 2 additions & 2 deletions
@@ -65,8 +65,8 @@ def __init__(self, **kwargs):
         self.indent_increment = 6
 
     def write_usage(self, prog: str, args: str = "", prefix: Optional[str] = None) -> None:
-        self.write(f"data-diff - efficiently diff rows across database tables.\n\n")
-        self.write(f"Usage:\n")
+        self.write("data-diff - efficiently diff rows across database tables.\n\n")
+        self.write("Usage:\n")
         self.write(f" * In-db diff: {prog} <database1> <table1> <table2> [OPTIONS]\n")
         self.write(f" * Cross-db diff: {prog} <database1> <table1> <database2> <table2> [OPTIONS]\n")
         self.write(f" * Using config: {prog} --conf PATH [--run NAME] [OPTIONS]\n")

data_diff/databases/base.py

Lines changed: 5 additions & 4 deletions
@@ -145,7 +145,7 @@ def query(self, sql_ast: Union[Expr, Generator], res_type: type = list):
            (row,) = row
            logger.debug("EXPLAIN: %s", row)
            answer = input("Continue? [y/n] ")
-           if not answer.lower() in ["y", "yes"]:
+           if answer.lower() not in ["y", "yes"]:
                sys.exit(1)
 
        res = self._query(sql_code)
@@ -327,9 +327,9 @@ def offset_limit(self, offset: Optional[int] = None, limit: Optional[int] = None
 
        return f"LIMIT {limit}"
 
-    def concat(self, l: List[str]) -> str:
-        assert len(l) > 1
-        joined_exprs = ", ".join(l)
+    def concat(self, items: List[str]) -> str:
+        assert len(items) > 1
+        joined_exprs = ", ".join(items)
        return f"concat({joined_exprs})"
 
    def is_distinct_from(self, a: str, b: str) -> str:
@@ -352,6 +352,7 @@ def _constant_value(self, v):
        elif isinstance(v, str):
            return f"'{v}'"
        elif isinstance(v, datetime):
+            # TODO use self.timestamp_value
            return f"timestamp '{v}'"
        elif isinstance(v, UUID):
            return f"'{v}'"

data_diff/databases/connect.py

Lines changed: 8 additions & 0 deletions
@@ -184,6 +184,8 @@ def connect(db_conf: Union[str, dict], thread_count: Optional[int] = 1) -> Datab
 
    Configuration can be given either as a URI string, or as a dict of {option: value}.
 
+    The dictionary configuration uses the same keys as the TOML 'database' definition given with --conf.
+
    thread_count determines the max number of worker threads per database,
    if relevant. None means no limit.
 
@@ -205,6 +207,12 @@ def connect(db_conf: Union[str, dict], thread_count: Optional[int] = 1) -> Datab
    - trino
    - clickhouse
    - vertica
+
+    Example:
+        >>> connect("mysql://localhost/db")
+        <data_diff.databases.mysql.MySQL object at 0x0000025DB45F4190>
+        >>> connect({"driver": "mysql", "host": "localhost", "database": "db"})
+        <data_diff.databases.mysql.MySQL object at 0x0000025DB3F94820>
    """
    if isinstance(db_conf, str):
        return connect_to_uri(db_conf, thread_count)
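A hedged usage sketch of the two configuration forms documented above; the host, credentials, and the extra dictionary keys (`user`, `password`, `port`) are illustrative assumptions, not values taken from this commit:

```python
from data_diff.databases.connect import connect

# URI form (hypothetical host and credentials)
db1 = connect("mysql://user:password@localhost:3306/db")

# Dict form, using the same keys as the TOML 'database' definition;
# 'user', 'password', and 'port' are assumed keys for illustration.
db2 = connect(
    {"driver": "mysql", "host": "localhost", "port": 3306,
     "user": "user", "password": "password", "database": "db"},
    thread_count=4,  # max number of worker threads for this database, if relevant
)
```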

data_diff/databases/database_types.py

Lines changed: 6 additions & 5 deletions
@@ -147,12 +147,12 @@ class AbstractDialect(ABC):
 
    @abstractmethod
    def quote(self, s: str):
-        "Quote SQL name (implementation specific)"
+        "Quote SQL name"
        ...
 
    @abstractmethod
-    def concat(self, l: List[str]) -> str:
-        "Provide SQL for concatenating a bunch of column into a string"
+    def concat(self, items: List[str]) -> str:
+        "Provide SQL for concatenating a bunch of columns into a string"
        ...
 
    @abstractmethod
@@ -162,12 +162,13 @@ def is_distinct_from(self, a: str, b: str) -> str:
 
    @abstractmethod
    def to_string(self, s: str) -> str:
+        # TODO rewrite using cast_to(x, str)
        "Provide SQL for casting a column to string"
        ...
 
    @abstractmethod
    def random(self) -> str:
-        "Provide SQL for generating a random number"
+        "Provide SQL for generating a random number between 0..1"
 
    @abstractmethod
    def offset_limit(self, offset: Optional[int] = None, limit: Optional[int] = None):
@@ -176,7 +177,7 @@ def offset_limit(self, offset: Optional[int] = None, limit: Optional[int] = None
 
    @abstractmethod
    def explain_as_text(self, query: str) -> str:
-        "Provide SQL for explaining a query, returned in as table(varchar)"
+        "Provide SQL for explaining a query, returned as table(varchar)"
        ...
 
    @abstractmethod
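To make the abstract interface above concrete, here is a hypothetical minimal dialect sketch implementing the methods this hunk touches; the class name, its SQL choices, and the absence of the real base classes are assumptions for illustration, not part of this commit:

```python
from typing import List, Optional

class ExampleDialect:
    """Hypothetical dialect sketch; real drivers subclass data_diff's base classes."""

    def quote(self, s: str) -> str:
        # Quote a SQL name
        return f'"{s}"'

    def concat(self, items: List[str]) -> str:
        # SQL for concatenating a bunch of columns into a string
        return "(" + " || ".join(items) + ")"

    def to_string(self, s: str) -> str:
        # SQL for casting a column to string
        return f"CAST({s} AS VARCHAR)"

    def random(self) -> str:
        # SQL for generating a random number between 0..1
        return "RANDOM()"

    def offset_limit(self, offset: Optional[int] = None, limit: Optional[int] = None) -> str:
        if offset:
            raise NotImplementedError("offset is not supported in this sketch")
        return f"LIMIT {limit}"

    def explain_as_text(self, query: str) -> str:
        return f"EXPLAIN {query}"
```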

data_diff/databases/oracle.py

Lines changed: 5 additions & 3 deletions
@@ -128,8 +128,8 @@ def offset_limit(self, offset: Optional[int] = None, limit: Optional[int] = None
 
        return f"FETCH NEXT {limit} ROWS ONLY"
 
-    def concat(self, l: List[str]) -> str:
-        joined_exprs = " || ".join(l)
+    def concat(self, items: List[str]) -> str:
+        joined_exprs = " || ".join(items)
        return f"({joined_exprs})"
 
    def timestamp_value(self, t: DbTime) -> str:
@@ -154,4 +154,6 @@ def type_repr(self, t) -> str:
        return super().type_repr(t)
 
    def constant_values(self, rows) -> str:
-        return " UNION ALL ".join("SELECT %s FROM DUAL" % ", ".join(self._constant_value(v) for v in row) for row in rows)
+        return " UNION ALL ".join(
+            "SELECT %s FROM DUAL" % ", ".join(self._constant_value(v) for v in row) for row in rows
+        )
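The reformatted `constant_values()` above is behaviour-preserving; this standalone sketch mirrors it with a simplified stand-in for `self._constant_value()` to show the Oracle SQL it assembles:

```python
# Simplified stand-in for the dialect's _constant_value() (sketch only).
def _constant_value(v) -> str:
    return f"'{v}'" if isinstance(v, str) else str(v)

def constant_values(rows) -> str:
    # One SELECT ... FROM DUAL per row, combined with UNION ALL, matching the method above.
    return " UNION ALL ".join(
        "SELECT %s FROM DUAL" % ", ".join(_constant_value(v) for v in row) for row in rows
    )

print(constant_values([(1, "a"), (2, "b")]))
# -> SELECT 1, 'a' FROM DUAL UNION ALL SELECT 2, 'b' FROM DUAL
```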

data_diff/databases/redshift.py

Lines changed: 2 additions & 2 deletions
@@ -36,8 +36,8 @@ def normalize_timestamp(self, value: str, coltype: TemporalType) -> str:
    def normalize_number(self, value: str, coltype: FractionalType) -> str:
        return self.to_string(f"{value}::decimal(38,{coltype.precision})")
 
-    def concat(self, l: List[str]) -> str:
-        joined_exprs = " || ".join(l)
+    def concat(self, items: List[str]) -> str:
+        joined_exprs = " || ".join(items)
        return f"({joined_exprs})"
 
    def select_table_schema(self, path: DbPath) -> str:

data_diff/databases/vertica.py

Lines changed: 2 additions & 2 deletions
@@ -99,8 +99,8 @@ def select_table_schema(self, path: DbPath) -> str:
    def quote(self, s: str):
        return f'"{s}"'
 
-    def concat(self, l: List[str]) -> str:
-        return " || ".join(l)
+    def concat(self, items: List[str]) -> str:
+        return " || ".join(items)
 
    def md5_to_int(self, s: str) -> str:
        return f"CAST(HEX_TO_INTEGER(SUBSTRING(MD5({s}), {1 + MD5_HEXDIGITS - CHECKSUM_HEXDIGITS})) AS NUMERIC(38, 0))"

data_diff/joindiff_tables.py

Lines changed: 1 addition & 2 deletions
@@ -2,7 +2,6 @@
 
 """
 
-from contextlib import suppress
 from decimal import Decimal
 from functools import partial
 import logging
@@ -21,7 +20,7 @@
 from .diff_tables import TableDiffer, DiffResult
 from .thread_utils import ThreadedYielder
 
-from .queries import table, sum_, min_, max_, avg, commit
+from .queries import table, sum_, min_, max_, avg
 from .queries.api import and_, if_, or_, outerjoin, leftjoin, rightjoin, this, ITable
 from .queries.ast_classes import Concat, Count, Expr, Random, TablePath
 from .queries.compiler import Compiler
