Skip to content
This repository was archived by the owner on May 17, 2024. It is now read-only.

Commit 60b050f

Browse files
authored
Merge pull request #226 from datafold/pr217_adjustments
PR #217 adjustments
2 parents f2b62e8 + 48a2072 commit 60b050f

File tree

12 files changed

+356
-193
lines changed

12 files changed

+356
-193
lines changed

.github/workflows/ci.yml

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ jobs:
1818
matrix:
1919
os: [ubuntu-latest]
2020
python-version:
21-
- "3.7.1"
21+
- "3.7"
2222
- "3.8"
2323
- "3.9"
2424
- "3.10"
@@ -34,7 +34,7 @@ jobs:
3434
python-version: ${{ matrix.python-version }}
3535

3636
- name: Build the stack
37-
run: docker-compose up -d mysql postgres presto trino
37+
run: docker-compose up -d mysql postgres presto trino clickhouse
3838

3939
- name: Install Poetry
4040
run: pip install poetry
@@ -47,4 +47,5 @@ jobs:
4747
DATADIFF_SNOWFLAKE_URI: '${{ secrets.DATADIFF_SNOWFLAKE_URI }}'
4848
DATADIFF_PRESTO_URI: '${{ secrets.DATADIFF_PRESTO_URI }}'
4949
DATADIFF_TRINO_URI: '${{ secrets.DATADIFF_TRINO_URI }}'
50+
DATADIFF_CLICKHOUSE_URI: 'clickhouse://clickhouse:Password1@localhost:9000/clickhouse'
5051
run: poetry run unittest-parallel -j 16

README.md

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -119,7 +119,7 @@ $ data-diff \
119119

120120
| Database | Connection string | Status |
121121
|---------------|-------------------------------------------------------------------------------------------------------------------------------------|--------|
122-
| PostgreSQL >=10 | `postgresql://<user>:<password>@<host>:5432/<database>` | 💚 |
122+
| PostgreSQL >=10 | `postgresql://<user>:<password>@<host>:5432/<database>` | 💚 |
123123
| MySQL | `mysql://<user>:<password>@<hostname>:5432/<database>` | 💚 |
124124
| Snowflake | `"snowflake://<user>[:<password>]@<account>/<database>/<SCHEMA>?warehouse=<WAREHOUSE>&role=<role>[&authenticator=externalbrowser]"` | 💚 |
125125
| Oracle | `oracle://<username>:<password>@<hostname>/database` | 💛 |
@@ -128,9 +128,9 @@ $ data-diff \
128128
| Presto | `presto://<username>:<password>@<hostname>:8080/<database>` | 💛 |
129129
| Databricks | `databricks://<http_path>:<access_token>@<server_hostname>/<catalog>/<schema>` | 💛 |
130130
| Trino | `trino://<username>:<password>@<hostname>:8080/<database>` | 💛 |
131+
| Clickhouse | `clickhouse://<username>:<password>@<hostname>:9000/<database>` | 💛 |
131132
| ElasticSearch | | 📝 |
132133
| Planetscale | | 📝 |
133-
| Clickhouse | | 📝 |
134134
| Pinot | | 📝 |
135135
| Druid | | 📝 |
136136
| Kafka | | 📝 |
@@ -171,6 +171,8 @@ While you may install them manually, we offer an easy way to install them along
171171

172172
- `pip install 'data-diff[trino]'`
173173

174+
- `pip install 'data-diff[clickhouse]'`
175+
174176
- For BigQuery, see: https://pypi.org/project/google-cloud-bigquery/
175177

176178

data_diff/databases/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,5 +9,6 @@
99
from .presto import Presto
1010
from .databricks import Databricks
1111
from .trino import Trino
12+
from .clickhouse import Clickhouse
1213

1314
from .connect import connect_to_uri

data_diff/databases/clickhouse.py

Lines changed: 152 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,152 @@
1+
from typing import Optional, Type
2+
3+
from .base import (
4+
MD5_HEXDIGITS,
5+
CHECKSUM_HEXDIGITS,
6+
TIMESTAMP_PRECISION_POS,
7+
ThreadedDatabase,
8+
import_helper,
9+
ConnectError,
10+
)
11+
from .database_types import ColType, Decimal, Float, Integer, FractionalType, Native_UUID, TemporalType, Text, Timestamp
12+
13+
14+
@import_helper("clickhouse")
15+
def import_clickhouse():
16+
import clickhouse_driver
17+
18+
return clickhouse_driver
19+
20+
21+
class Clickhouse(ThreadedDatabase):
    """Clickhouse dialect for data-diff, connecting via the `clickhouse_driver` DB-API.

    Note that Clickhouse has no separate schema concept: the database name
    doubles as the default schema.
    """

    # Server-reported type name -> data-diff column type.
    TYPE_CLASSES = {
        "Int8": Integer,
        "Int16": Integer,
        "Int32": Integer,
        "Int64": Integer,
        "Int128": Integer,
        "Int256": Integer,
        "UInt8": Integer,
        "UInt16": Integer,
        "UInt32": Integer,
        "UInt64": Integer,
        "UInt128": Integer,
        "UInt256": Integer,
        "Float32": Float,
        "Float64": Float,
        "Decimal": Decimal,
        "UUID": Native_UUID,
        "String": Text,
        "FixedString": Text,
        "DateTime": Timestamp,
        "DateTime64": Timestamp,
    }
    # Clickhouse truncates rather than rounds on precision loss.
    ROUNDS_ON_PREC_LOSS = False

    def __init__(self, *, thread_count: int, **kw):
        """Store connection keyword arguments; connections are created lazily per thread."""
        super().__init__(thread_count=thread_count)

        self._args = kw
        # In Clickhouse database and schema are the same
        self.default_schema = kw["database"]

    def create_connection(self):
        """Create a new (non-thread-safe) DB-API connection for the calling thread.

        Raises:
            ConnectError: if the driver fails to connect.
        """
        clickhouse = import_clickhouse()

        class SingleConnection(clickhouse.dbapi.connection.Connection):
            """Not thread-safe connection to Clickhouse"""

            def cursor(self, cursor_factory=None):
                # Create the cursor only once and keep handing out the same one;
                # `cursor_factory` is accepted for DB-API compatibility but ignored.
                if not self.cursors:
                    _ = super().cursor()
                return self.cursors[0]

        try:
            return SingleConnection(**self._args)
        except clickhouse.OperationError as e:
            # NOTE(review): clickhouse_driver's DB-API exception class is spelled
            # `OperationalError` — confirm `OperationError` exists on the
            # top-level module, otherwise this handler raises AttributeError.
            raise ConnectError(*e.args) from e

    def _parse_type_repr(self, type_repr: str) -> Optional[Type[ColType]]:
        """Map a server type string (e.g. ``Nullable(DateTime64(6))``) to a ColType class.

        Returns None for unrecognized types.
        """
        nullable_prefix = "Nullable("
        if type_repr.startswith(nullable_prefix):
            # Strip the Nullable(...) wrapper. rstrip also removes the closing
            # parens of a nested parametrized type, which is harmless because
            # only the name prefix is matched below.
            type_repr = type_repr[len(nullable_prefix):].rstrip(")")

        # Parametrized types: reduce to the bare type name.
        if type_repr.startswith("Decimal"):
            type_repr = "Decimal"
        elif type_repr.startswith("FixedString"):
            type_repr = "FixedString"
        elif type_repr.startswith("DateTime64"):
            type_repr = "DateTime64"

        return self.TYPE_CLASSES.get(type_repr)

    def quote(self, s: str) -> str:
        """Quote an identifier for use in SQL."""
        return f'"{s}"'

    def md5_to_int(self, s: str) -> str:
        """SQL expression: the last CHECKSUM_HEXDIGITS hex digits of MD5(s) as an integer."""
        substr_idx = 1 + MD5_HEXDIGITS - CHECKSUM_HEXDIGITS
        return f"reinterpretAsUInt128(reverse(unhex(lowerUTF8(substr(hex(MD5({s})), {substr_idx})))))"

    def to_string(self, s: str) -> str:
        """SQL expression casting `s` to String."""
        return f"toString({s})"

    def normalize_timestamp(self, value: str, coltype: TemporalType) -> str:
        """SQL expression rendering a timestamp as a canonical string with 6 fractional digits."""
        prec = coltype.precision
        if coltype.rounds:
            # Round the microsecond representation to the column's precision,
            # then render it back as a DateTime64 string.
            timestamp = f"toDateTime64(round(toUnixTimestamp64Micro(toDateTime64({value}, 6)) / 1000000, {prec}), 6)"
            return self.to_string(timestamp)

        fractional = f"toUnixTimestamp64Micro(toDateTime64({value}, {prec})) % 1000000"
        fractional = f"lpad({self.to_string(fractional)}, 6, '0')"
        # NOTE(review): `fractional` is already a String after lpad, so the
        # to_string() below is a no-op cast — kept for safety.
        value = f"formatDateTime({value}, '%Y-%m-%d %H:%M:%S') || '.' || {self.to_string(fractional)}"
        return f"rpad({value}, {TIMESTAMP_PRECISION_POS + 6}, '0')"

    def _convert_db_precision_to_digits(self, p: int) -> int:
        # Done the same as for PostgreSQL but need to rewrite in another way
        # because it does not help for float with a big integer part.
        return super()._convert_db_precision_to_digits(p) - 2

    def normalize_number(self, value: str, coltype: FractionalType) -> str:
        """SQL expression rendering a fractional number as a string with fixed precision.

        If a decimal value has trailing zeros in a fractional part, when casting to string they are dropped.
        For example:
            select toString(toDecimal128(1.10, 2)); -- the result is 1.1
            select toString(toDecimal128(1.00, 2)); -- the result is 1
        So, we should use some custom approach to save these trailing zeros.
        To avoid it, we can add a small value like 0.000001 to prevent dropping of zeros from the end when casting.
        For examples above it looks like:
            select toString(toDecimal128(1.10, 2 + 1) + toDecimal128(0.001, 3)); -- the result is 1.101
        After that, cut an extra symbol from the string, i.e. 1.101 -> 1.10
        So, the algorithm is:
        1. Cast to decimal with precision + 1
        2. Add a small value 10^(-precision-1)
        3. Cast the result to string
        4. Drop the extra digit from the string. To do that, we need to slice the string
           with length = digits in an integer part + 1 (symbol of ".") + precision
        """

        if coltype.precision == 0:
            return self.to_string(f"round({value})")

        precision = coltype.precision
        # TODO: too complex — is there a better-performing way?
        value = f"""
            if({value} >= 0, '', '-') || left(
                toString(
                    toDecimal128(
                        round(abs({value}), {precision}),
                        {precision} + 1
                    )
                    +
                    toDecimal128(
                        exp10(-{precision + 1}),
                        {precision} + 1
                    )
                ),
                toUInt8(
                    greatest(
                        floor(log10(abs({value}))) + 1,
                        1
                    )
                ) + 1 + {precision}
            )
        """
        return value

data_diff/databases/connect.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
from .presto import Presto
1515
from .databricks import Databricks
1616
from .trino import Trino
17+
from .clickhouse import Clickhouse
1718

1819

1920
@dataclass
@@ -85,6 +86,7 @@ def match_path(self, dsn):
8586
help_str="databricks://:access_token@server_name/http_path",
8687
),
8788
"trino": MatchUriPath(Trino, ["catalog", "schema"], help_str="trino://<user>@<host>/<catalog>/<schema>"),
89+
"clickhouse": MatchUriPath(Clickhouse, ["database?"], help_str="clickhouse://<user>:<pass>@<host>/<database>"),
8890
}
8991

9092

@@ -110,6 +112,7 @@ def connect_to_uri(db_uri: str, thread_count: Optional[int] = 1) -> Database:
110112
- presto
111113
- databricks
112114
- trino
115+
- clickhouse
113116
"""
114117

115118
dsn = dsnparse.parse(db_uri)

data_diff/table_segment.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -218,7 +218,7 @@ def count_and_checksum(self) -> Tuple[int, int]:
218218

219219
if count:
220220
assert checksum, (count, checksum)
221-
return count or 0, checksum if checksum is None else int(checksum)
221+
return count or 0, int(checksum) if count else None
222222

223223
def query_key_range(self) -> Tuple[int, int]:
224224
"""Query database for minimum and maximum key. This is used for setting the initial bounds."""

dev/dev.env

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,4 +5,9 @@ POSTGRES_DB=postgres
55
MYSQL_DATABASE=mysql
66
MYSQL_USER=mysql
77
MYSQL_PASSWORD=Password1
8-
MYSQL_ROOT_PASSWORD=RootPassword1
8+
MYSQL_ROOT_PASSWORD=RootPassword1
9+
10+
CLICKHOUSE_USER=clickhouse
11+
CLICKHOUSE_PASSWORD=Password1
12+
CLICKHOUSE_DB=clickhouse
13+
CLICKHOUSE_DEFAULT_ACCESS_MANAGEMENT=1

docker-compose.yml

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,29 @@ services:
5151
networks:
5252
- local
5353

54+
clickhouse:
55+
container_name: clickhouse
56+
image: clickhouse/clickhouse-server:21.12.3.32
57+
restart: always
58+
volumes:
59+
- clickhouse-data:/var/lib/clickhouse:delegated
60+
ulimits:
61+
nproc: 65535
62+
nofile:
63+
soft: 262144
64+
hard: 262144
65+
ports:
66+
- '8123:8123'
67+
- '9000:9000'
68+
expose:
69+
- '8123'
70+
- '9000'
71+
env_file:
72+
- dev/dev.env
73+
tty: true
74+
networks:
75+
- local
76+
5477
# prestodb.dbapi.connect(host="127.0.0.1", user="presto").cursor().execute('SELECT * FROM system.runtime.nodes')
5578
presto:
5679
build:
@@ -77,6 +100,7 @@ services:
77100
volumes:
78101
postgresql-data:
79102
mysql-data:
103+
clickhouse-data:
80104

81105
networks:
82106
local:

0 commit comments

Comments
 (0)