Skip to content
This repository was archived by the owner on May 17, 2024. It is now read-only.

Commit 791369a

Browse files
committed
Merge branch 'master' into daniel-dx-666-improve-dbt-cloud-event-metadata
2 parents da1a0af + 2d5db0f commit 791369a

26 files changed

+1116
-445
lines changed
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
# if a comment is added to an issue awaiting response, remove the "awaiting_response" label and add a "triage" label
2+
3+
name: Awaiting Response --> Triage Label
4+
5+
on: issue_comment
6+
7+
defaults:
8+
run:
9+
shell: bash
10+
11+
permissions:
12+
issues: write
13+
14+
jobs:
15+
triage_label:
16+
if: contains(github.event.issue.labels.*.name, 'awaiting_response')
17+
runs-on: ubuntu-latest
18+
steps:
19+
- name: initial labeling
20+
uses: andymckay/labeler@master
21+
with:
22+
add-labels: "triage"
23+
remove-labels: "awaiting_response"
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
# if a comment is added to a stale issue, remove the "stale" label and add a "triage" label
2+
3+
name: Stale --> Triage Label
4+
5+
on: issue_comment
6+
7+
defaults:
8+
run:
9+
shell: bash
10+
11+
permissions:
12+
issues: write
13+
14+
jobs:
15+
triage_label:
16+
if: contains(github.event.issue.labels.*.name, 'stale')
17+
runs-on: ubuntu-latest
18+
steps:
19+
- name: initial labeling
20+
uses: andymckay/labeler@master
21+
with:
22+
add-labels: "triage"
23+
remove-labels: "stale"

.github/workflows/stale.yml

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
# This workflow warns and then closes issues and PRs that have had no activity for a specified amount of time.
2+
#
3+
# You can adjust the behavior by modifying this file.
4+
# For more information, see:
5+
# https://github.com/actions/stale
6+
name: Mark stale issues and pull requests
7+
8+
on:
9+
schedule:
10+
- cron: '28 6 * * *'
11+
12+
jobs:
13+
stale:
14+
15+
runs-on: ubuntu-latest
16+
permissions:
17+
issues: write
18+
pull-requests: write
19+
20+
steps:
21+
- uses: actions/stale@v5
22+
with:
23+
repo-token: ${{ secrets.GITHUB_TOKEN }}
24+
stale-issue-message: 'This issue has been marked as stale because it has been open for 60 days with no activity. If you would like the issue to remain open, please comment on the issue and it will be added to the triage queue. Otherwise, it will be closed in 7 days.'
25+
stale-pr-message: 'This pull request has been marked as stale because it has been open for 60 days with no activity. If you would like the pull request to remain open, please comment on the pull request and it will be added to the triage queue. Otherwise, it will be closed in 7 days.'
26+
stale-issue-label: 'stale'
27+
stale-pr-label: 'stale'
28+
close-issue-message: "Although we are closing this issue as stale, it's not gone forever. Issues can be reopened if there is renewed community interest. Just add a comment and it will be reopened for triage."
29+
close-pr-message: "Although we are closing this pull request as stale, it's not gone forever. PRs can be reopened if there is renewed community interest. Just add a comment and it will be reopened for triage."
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
name: Label issues
2+
on:
3+
issues:
4+
types:
5+
- reopened
6+
- opened
7+
jobs:
8+
label_issues:
9+
runs-on: ubuntu-latest
10+
permissions:
11+
issues: write
12+
steps:
13+
- uses: actions/github-script@v6
14+
with:
15+
script: |
16+
github.rest.issues.addLabels({
17+
issue_number: context.issue.number,
18+
owner: context.repo.owner,
19+
repo: context.repo.repo,
20+
labels: ["triage"]
21+
})

README.md

Lines changed: 61 additions & 86 deletions
Original file line numberDiff line numberDiff line change
@@ -2,136 +2,111 @@
22
<img alt="Datafold" src="https://user-images.githubusercontent.com/1799931/196497110-d3de1113-a97f-4322-b531-026d859b867a.png" width="50%" />
33
</p>
44

5-
# **data-diff**
5+
<h1 align="center">
6+
data-diff
7+
</h1>
8+
9+
<h2 align="center">
10+
Develop dbt models faster by testing as you code.
11+
</h2>
12+
<h4 align="center">
13+
See how every change to dbt code affects the data produced in the modified model and downstream.
14+
</h4>
15+
<br>
616

717
## What is `data-diff`?
8-
data-diff is a **free, open-source tool** that enables data professionals to detect differences in values between any two tables.
918

10-
## Documentation
19+
data-diff is an open-source package that you can use to see the impact of your dbt code changes on your dbt models as you code.
1120

12-
[**🗎 Documentation**](https://docs.datafold.com/guides/os_data_diff) - our detailed documentation has everything you need to start diffing.
21+
<div align="center">
1322

14-
### Databases we support
23+
![development_testing_gif](https://user-images.githubusercontent.com/1799931/236354286-d1d044cf-2168-4128-8a21-8c8ca7fd494c.gif)
1524

16-
- PostgreSQL >=10
17-
- MySQL
18-
- Snowflake
19-
- BigQuery
20-
- Redshift
21-
- Oracle
22-
- Presto
23-
- Databricks
24-
- Trino
25-
- Clickhouse
26-
- Vertica
27-
- DuckDB >=0.6
28-
- SQLite (coming soon)
25+
</div>
2926

30-
For their corresponding connection strings, check out our [detailed table](https://github.com/datafold/data-diff/blob/master/docs/supported-databases.md).
27+
<br>
3128

32-
#### Looking for a database not on the list?
33-
If a database is not on the list, we'd still love to support it. [Please open an issue](https://github.com/datafold/data-diff/issues) to discuss it, or vote on existing requests to push them up our todo list.
29+
:eyes: **Watch the 4-minute demo video [here](https://www.loom.com/share/ad3df969ba6b4298939efb2fbcc14cde)**
3430

35-
## Get started
31+
## Getting Started
3632

37-
### Installation
33+
**Install `data-diff`**
3834

39-
#### First, install `data-diff` using `pip`.
35+
Install `data-diff` with the command that is specific to the database you use with dbt.
4036

37+
### Snowflake
4138
```
42-
pip install data-diff
39+
pip install data-diff 'data-diff[snowflake,dbt]' -U
4340
```
4441

45-
#### Then, install one or more driver(s) specific to the database(s) you want to connect to.
46-
47-
- `pip install 'data-diff[mysql]'`
48-
49-
- `pip install 'data-diff[postgresql]'`
50-
51-
- `pip install 'data-diff[snowflake]'`
52-
53-
- `pip install 'data-diff[presto]'`
54-
55-
- `pip install 'data-diff[oracle]'`
56-
57-
- `pip install 'data-diff[trino]'`
58-
59-
- `pip install 'data-diff[clickhouse]'`
60-
61-
- `pip install 'data-diff[vertica]'`
62-
63-
- For BigQuery, see: https://pypi.org/project/google-cloud-bigquery/
64-
65-
_Some drivers have dependencies that cannot be installed using `pip` and still need to be installed manually._
66-
67-
### Run your first diff
42+
### BigQuery
43+
```
44+
pip install data-diff 'data-diff[dbt]' google-cloud-bigquery -U
45+
```
6846

69-
Once you've installed `data-diff`, you can run it from the command line.
47+
### Redshift
48+
```
49+
pip install data-diff 'data-diff[redshift,dbt]' -U
50+
```
7051

52+
### Postgres
7153
```
72-
data-diff DB1_URI TABLE1_NAME DB2_URI TABLE2_NAME [OPTIONS]
54+
pip install data-diff 'data-diff[postgresql,dbt]' -U
7355
```
7456

75-
Be sure to read [the docs](https://docs.datafold.com/reference/open_source/cli) for detailed instructions how to build one of these commands depending on your database setup.
57+
### Databricks
58+
```
59+
pip install data-diff 'data-diff[databricks,dbt]' -U
60+
```
7661

77-
#### Code Example: Diff Tables Between Databases
78-
Here's an example command for your copy/pasting, taken from the screenshot above when we diffed data between Snowflake and Postgres.
62+
### DuckDB
63+
```
64+
pip install data-diff 'data-diff[duckdb,dbt]' -U
65+
```
7966

67+
**Update a few lines in your `dbt_project.yml`**.
8068
```
81-
data-diff \
82-
postgresql://<username>:'<password>'@localhost:5432/<database> \
83-
<table> \
84-
"snowflake://<username>:<password>@<password>/<DATABASE>/<SCHEMA>?warehouse=<WAREHOUSE>&role=<ROLE>" \
85-
<TABLE> \
86-
-k activity_id \
87-
-c activity \
88-
-w "event_timestamp < '2022-10-10'"
69+
#dbt_project.yml
70+
vars:
71+
data_diff:
72+
prod_database: my_database
73+
prod_schema: my_default_schema
8974
```
9075

91-
#### Code Example: Diff Tables Within a Database
76+
**Run your first data diff!**
9277

9378
```
94-
data-diff \
95-
"snowflake://<username>:<password>@<password>/<DATABASE>/<SCHEMA_1>?warehouse=<WAREHOUSE>&role=<ROLE>" <TABLE_1> \
96-
<SCHEMA_2>.<TABLE_2> \
97-
-k org_id \
98-
-c created_at -c is_internal \
99-
-w "org_id != 1 and org_id < 2000" \
100-
-m test_results_%t \
101-
--materialize-all-rows \
102-
--table-write-limit 10000
79+
dbt run && data-diff --dbt
10380
```
10481

105-
In both code examples, I've used `<>` carrots to represent values that **should be replaced with your values** in the database connection strings. For the flags (`-k`, `-c`, etc.), I opted for "real" values (`org_id`, `is_internal`) to give you a more realistic view of what your command will look like.
82+
We recommend you get started by walking through [our simple setup instructions](https://docs.datafold.com/development_testing/open_source), which contain examples and details.
83+
84+
Please reach out on the dbt Slack in [#tools-datafold](https://getdbt.slack.com/archives/C03D25A92UU) if you have any trouble whatsoever getting started!
10685

107-
### We're here to help!
86+
<br><br>
10887

109-
We're here to help! Please post any questions in [GitHub Discussions](https://github.com/datafold/data-diff/discussions).
88+
### Diffing between databases
11089

111-
## How to Use
90+
Check out our [documentation](https://docs.datafold.com/reference/open_source/cli) if you're looking to compare data across databases (for example, between Postgres and Snowflake).
11291

113-
* [Examples with dbt, joindiff, and hashdiff](https://docs.datafold.com/reference/open_source/cli#examples)
114-
* [Examples with Python](https://data-diff.readthedocs.io/en/latest/python-api.html)
115-
* [How to use with TOML configuration file](https://docs.datafold.com/reference/open_source/cli#toml-config-file)
92+
<br>
11693

117-
## How to Contribute
118-
* Feel free to open an issue or contribute to the project by working on an existing issue.
119-
* Please read the [contributing guidelines](https://github.com/datafold/data-diff/blob/master/CONTRIBUTING.md) to get started.
120-
* To add a new database driver, check out [docs](https://github.com/datafold/data-diff/blob/master/docs/new-database-driver-guide.rst).
94+
## Contributors
12195

122-
Big thanks to everyone who contributed so far:
96+
We thank everyone who has contributed so far!
12397

12498
<a href="https://github.com/datafold/data-diff/graphs/contributors">
12599
<img src="https://contributors-img.web.app/image?repo=datafold/data-diff" />
126100
</a>
127101

128-
## Technical Explanation
129-
130-
Check out this [technical explanation](https://github.com/datafold/data-diff/blob/master/docs/technical-explanation.md) of how data-diff works.
102+
<br>
131103

132104
## Analytics
105+
133106
* [Usage Analytics & Data Privacy](https://github.com/datafold/data-diff/blob/master/docs/usage_analytics.md)
134107

108+
<br>
109+
135110
## License
136111

137112
This project is licensed under the terms of the [MIT License](https://github.com/datafold/data-diff/blob/master/LICENSE).

data_diff/__main__.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -228,6 +228,13 @@ def write_usage(self, prog: str, args: str = "", prefix: Optional[str] = None) -
228228
metavar="PATH",
229229
help="Which directory to look in for the dbt_project.yml file. Default is the current working directory and its parents.",
230230
)
231+
@click.option(
232+
"--select",
233+
"-s",
234+
default=None,
235+
metavar="PATH",
236+
help="select dbt resources to compare using dbt selection syntax",
237+
)
231238
def main(conf, run, **kw):
232239
if kw["table2"] is None and kw["database2"]:
233240
# Use the "database table table" form
@@ -264,6 +271,7 @@ def main(conf, run, **kw):
264271
profiles_dir_override=kw["dbt_profiles_dir"],
265272
project_dir_override=kw["dbt_project_dir"],
266273
is_cloud=kw["cloud"],
274+
dbt_selection=kw["select"],
267275
)
268276
else:
269277
return _data_diff(**kw)
@@ -306,6 +314,7 @@ def _data_diff(
306314
cloud,
307315
dbt_profiles_dir,
308316
dbt_project_dir,
317+
select,
309318
threads1=None,
310319
threads2=None,
311320
__conf__=None,

data_diff/cloud/datafold_api.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,8 @@ class TCloudApiDataDiff(pydantic.BaseModel):
103103
pk_columns: List[str]
104104
filter1: Optional[str] = None
105105
filter2: Optional[str] = None
106+
include_columns: Optional[List[str]]
107+
exclude_columns: Optional[List[str]]
106108

107109

108110
class TCloudApiOrgMeta(pydantic.BaseModel):
@@ -138,6 +140,8 @@ class TSummaryResultSchemaStats(pydantic.BaseModel):
138140
column_type_mismatches: int
139141
column_reorders: int
140142
column_counts: Tuple[int, int]
143+
column_type_differs: List[str]
144+
exclusive_columns: Tuple[List[str], List[str]]
141145

142146

143147
class TCloudApiDataDiffSummaryResult(pydantic.BaseModel):
@@ -202,6 +206,11 @@ def get_data_sources(self) -> List[TCloudApiDataSource]:
202206
rv.raise_for_status()
203207
return [TCloudApiDataSource(**item) for item in rv.json()]
204208

209+
def get_data_source(self, data_source_id: int) -> TCloudApiDataSource:
210+
rv = self.make_get_request(url=f"api/v1/data_sources/{data_source_id}")
211+
rv.raise_for_status()
212+
return TCloudApiDataSource(**rv.json())
213+
205214
def create_data_source(self, config: TDsConfig) -> TCloudApiDataSource:
206215
payload = config.dict()
207216
if config.type == "bigquery":

0 commit comments

Comments
 (0)