Skip to content
This repository was archived by the owner on May 17, 2024. It is now read-only.

Commit 7454556

Browse files
authored
Merge branch 'master' into bigquery-dbt-impersonation
2 parents 459fdde + 3752f5c commit 7454556

File tree

11 files changed

+129
-23
lines changed

11 files changed

+129
-23
lines changed

README.md

Lines changed: 60 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -81,18 +81,73 @@ More information about the algorithm and performance considerations can be found
8181
pip install data-diff 'data-diff[postgresql,snowflake]' -U
8282
```
8383

84-
Run `data-diff` with connection URIs. In the following example, we compare tables between PostgreSQL and Snowflake using hashdiff algorithm:
85-
```
84+
Run `data-diff` with connection URIs. In the following example, we compare tables between PostgreSQL and Snowflake using the hashdiff algorithm:
85+
86+
```bash
8687
data-diff \
8788
postgresql://<username>:'<password>'@localhost:5432/<database> \
8889
<table> \
89-
"snowflake://<username>:<password>@<password>/<DATABASE>/<SCHEMA>?warehouse=<WAREHOUSE>&role=<ROLE>" \
90+
"snowflake://<username>:<password>@<account>/<DATABASE>/<SCHEMA>?warehouse=<WAREHOUSE>&role=<ROLE>" \
9091
<TABLE> \
9192
-k <primary key column> \
9293
-c <columns to compare> \
9394
-w <filter condition>
9495
```
9596

97+
Run `data-diff` with a `toml` configuration file. In the following example, we compare tables between MotherDuck (hosted DuckDB) and Snowflake using the hashdiff algorithm:
98+
99+
```toml
100+
## DATABASE CONNECTION ##
101+
[database.duckdb_connection]
102+
driver = "duckdb"
103+
# filepath = "datafold_demo.duckdb" # local duckdb file example
104+
# filepath = "md:" # default motherduck connection example
105+
filepath = "md:datafold_demo?motherduck_token=${motherduck_token}" # API token recommended for motherduck connection
106+
database = "datafold_demo"
107+
108+
[database.snowflake_connection]
109+
driver = "snowflake"
110+
database = "DEV"
111+
user = "sung"
112+
password = "${SNOWFLAKE_PASSWORD}" # or "<PASSWORD_STRING>"
113+
# the info below is only required for snowflake
114+
account = "${ACCOUNT}" # by33919
115+
schema = "DEVELOPMENT"
116+
warehouse = "DEMO"
117+
role = "DEMO_ROLE"
118+
119+
## RUN PARAMETERS ##
120+
[run.default]
121+
verbose = true
122+
123+
## EXAMPLE DATA DIFF JOB ##
124+
[run.demo_xdb_diff]
125+
# Source 1 ("left")
126+
1.database = "duckdb_connection"
127+
1.table = "development.raw_orders"
128+
129+
# Source 2 ("right")
130+
2.database = "snowflake_connection"
131+
2.table = "RAW_ORDERS" # note that snowflake table names are case-sensitive
132+
133+
verbose = false
134+
```
135+
136+
```bash
137+
# export relevant environment variables, example below
138+
export motherduck_token=<MOTHERDUCK_TOKEN>
139+
140+
# run the configured data-diff job
141+
data-diff --conf datadiff.toml \
142+
--run demo_xdb_diff \
143+
-k "id" \
144+
-c status
145+
146+
# output example
147+
- 1, completed
148+
+ 1, returned
149+
```
150+
96151
Check out [documentation](https://docs.datafold.com/reference/open_source/cli) for the full command reference.
97152

98153

@@ -106,13 +161,14 @@ Check out [documentation](https://docs.datafold.com/reference/open_source/cli) f
106161
| Snowflake | 🟢 | `"snowflake://<user>[:<password>]@<account>/<database>/<SCHEMA>?warehouse=<WAREHOUSE>&role=<role>[&authenticator=externalbrowser]"` |
107162
| BigQuery | 🟢 | `bigquery://<project>/<dataset>` |
108163
| Redshift | 🟢 | `redshift://<username>:<password>@<hostname>:5439/<database>` |
164+
| DuckDB | 🟢 | `duckdb://<dbname>@<filepath>` |
165+
| MotherDuck | 🟢 | `duckdb://<dbname>@<filepath>` |
109166
| Oracle | 🟡 | `oracle://<username>:<password>@<hostname>/<service_or_sid>` |
110167
| Presto | 🟡 | `presto://<username>:<password>@<hostname>:8080/<database>` |
111168
| Databricks | 🟡 | `databricks://<http_path>:<access_token>@<server_hostname>/<catalog>/<schema>` |
112169
| Trino | 🟡 | `trino://<username>:<password>@<hostname>:8080/<database>` |
113170
| Clickhouse | 🟡 | `clickhouse://<username>:<password>@<hostname>:9000/<database>` |
114171
| Vertica | 🟡 | `vertica://<username>:<password>@<hostname>:5433/<database>` |
115-
| DuckDB | 🟡 | |
116172
| ElasticSearch | 📝 | |
117173
| Planetscale | 📝 | |
118174
| Pinot | 📝 | |

data_diff/__main__.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,10 +59,13 @@ def _get_log_handlers(is_dbt: Optional[bool] = False) -> Dict[str, logging.Handl
5959
return handlers
6060

6161

62-
def _remove_passwords_in_dict(d: dict):
62+
def _remove_passwords_in_dict(d: dict) -> None:
6363
for k, v in d.items():
6464
if k == "password":
6565
d[k] = "*" * len(v)
66+
elif k == "filepath":
67+
if "motherduck_token=" in v:
68+
d[k] = v.split("motherduck_token=")[0] + "motherduck_token=**********"
6669
elif isinstance(v, dict):
6770
_remove_passwords_in_dict(v)
6871
elif k.startswith("database"):

data_diff/databases/redshift.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,11 +19,12 @@
1919
TIMESTAMP_PRECISION_POS,
2020
PostgresqlDialect,
2121
Mixin_NormalizeValue,
22+
Mixin_MD5,
2223
)
2324

2425

2526
@attrs.define(frozen=False)
26-
class Mixin_MD5(AbstractMixin_MD5):
27+
class Mixin_MD5(Mixin_MD5):
2728
def md5_as_int(self, s: str) -> str:
2829
return f"strtol(substring(md5({s}), {1+MD5_HEXDIGITS-CHECKSUM_HEXDIGITS}), 16)::decimal(38)"
2930

data_diff/diff_tables.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ class Algorithm(Enum):
3030
DiffResult = Iterator[Tuple[str, tuple]] # Iterator[Tuple[Literal["+", "-"], tuple]]
3131

3232

33-
@attrs.define(frozen=True)
33+
@attrs.define(frozen=False)
3434
class ThreadBase:
3535
"Provides utility methods for optional threading"
3636

@@ -179,7 +179,7 @@ def get_stats_dict(self, is_dbt: bool = False):
179179
return json_output
180180

181181

182-
@attrs.define(frozen=True)
182+
@attrs.define(frozen=False)
183183
class TableDiffer(ThreadBase, ABC):
184184
bisection_factor = 32
185185
stats: dict = {}

data_diff/hashdiff_tables.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ def diff_sets(a: list, b: list, json_cols: dict = None) -> Iterator:
5252
yield from v
5353

5454

55-
@attrs.define(frozen=True)
55+
@attrs.define(frozen=False)
5656
class HashDiffer(TableDiffer):
5757
"""Finds the diff between two SQL tables
5858

data_diff/joindiff_tables.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99

1010
import attrs
1111

12-
from data_diff.databases import Database, MsSQL, MySQL, BigQuery, Presto, Oracle, Snowflake
12+
from data_diff.databases import Database, MsSQL, MySQL, BigQuery, Presto, Oracle, Snowflake, DuckDB
1313
from data_diff.abcs.database_types import NumericType, DbPath
1414
from data_diff.databases.base import Compiler
1515
from data_diff.queries.api import (
@@ -110,7 +110,7 @@ def json_friendly_value(v):
110110
return v
111111

112112

113-
@attrs.define(frozen=True)
113+
@attrs.define(frozen=False)
114114
class JoinDiffer(TableDiffer):
115115
"""Finds the diff between two SQL tables in the same database, using JOINs.
116116
@@ -157,7 +157,7 @@ def _diff_tables_root(self, table1: TableSegment, table2: TableSegment, info_tre
157157
drop_table(db, self.materialize_to_table)
158158

159159
with self._run_in_background(*bg_funcs):
160-
if isinstance(db, (Snowflake, BigQuery)):
160+
if isinstance(db, (Snowflake, BigQuery, DuckDB)):
161161
# Don't segment the table; let the database handle parallelization
162162
yield from self._diff_segments(None, table1, table2, info_tree, None)
163163
else:

data_diff/utils.py

Lines changed: 15 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -270,6 +270,9 @@ def remove_passwords_in_dict(d: dict, replace_with: str = "***"):
270270
for k, v in d.items():
271271
if k == "password":
272272
d[k] = replace_with
273+
elif k == "filepath":
274+
if "motherduck_token=" in v:
275+
d[k] = v.split("motherduck_token=")[0] + f"motherduck_token={replace_with}"
273276
elif isinstance(v, dict):
274277
remove_passwords_in_dict(v, replace_with)
275278
elif k.startswith("database"):
@@ -284,14 +287,18 @@ def _join_if_any(sym, args):
284287

285288

286289
def remove_password_from_url(url: str, replace_with: str = "***") -> str:
287-
parsed = urlparse(url)
288-
account = parsed.username or ""
289-
if parsed.password:
290-
account += ":" + replace_with
291-
host = _join_if_any(":", filter(None, [parsed.hostname, parsed.port]))
292-
netloc = _join_if_any("@", filter(None, [account, host]))
293-
replaced = parsed._replace(netloc=netloc)
294-
return replaced.geturl()
290+
if "motherduck_token=" in url:
291+
replace_token_url = url.split("motherduck_token=")[0] + f"motherduck_token={replace_with}"
292+
return replace_token_url
293+
else:
294+
parsed = urlparse(url)
295+
account = parsed.username or ""
296+
if parsed.password:
297+
account += ":" + replace_with
298+
host = _join_if_any(":", filter(None, [parsed.hostname, parsed.port]))
299+
netloc = _join_if_any("@", filter(None, [account, host]))
300+
replaced = parsed._replace(netloc=netloc)
301+
return replaced.geturl()
295302

296303

297304
def match_like(pattern: str, strs: Sequence[str]) -> Iterable[str]:

data_diff/version.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.9.3"
1+
__version__ = "0.9.5"

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[tool.poetry]
22
name = "data-diff"
3-
version = "0.9.3"
3+
version = "0.9.5"
44
description = "Command-line tool and Python library to efficiently diff rows across two different databases."
55
authors = ["Datafold <data-diff@datafold.com>"]
66
license = "MIT"

tests/test_joindiff.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
db.MySQL,
2323
db.Snowflake,
2424
db.BigQuery,
25+
db.DuckDB,
2526
db.Oracle,
2627
db.Redshift,
2728
db.Presto,
@@ -32,7 +33,7 @@
3233
test_each_database = test_each_database_in_list(TEST_DATABASES)
3334

3435

35-
@test_each_database_in_list({db.Snowflake, db.BigQuery})
36+
@test_each_database_in_list({db.Snowflake, db.BigQuery, db.DuckDB})
3637
class TestCompositeKey(DiffTestCase):
3738
src_schema = {"id": int, "userid": int, "movieid": int, "rating": float, "timestamp": datetime}
3839
dst_schema = {"id": int, "userid": int, "movieid": int, "rating": float, "timestamp": datetime}

0 commit comments

Comments
 (0)