Skip to content
This repository was archived by the owner on May 17, 2024. It is now read-only.

Commit e73c7fa

Browse files
pik94 authored and erezsh committed
Set ROUNDS_ON_PREC_LOSS flag to False for Clickhouse
1 parent f325f93 commit e73c7fa

File tree

2 files changed

+50
-57
lines changed

2 files changed

+50
-57
lines changed

data_diff/databases/clickhouse.py

Lines changed: 49 additions & 49 deletions
Original file line number | Diff line number | Diff line change
@@ -2,16 +2,21 @@
22

33
import clickhouse_driver.dbapi.connection
44

5-
from .base import ThreadedDatabase, import_helper, ConnectError
6-
from .base import MD5_HEXDIGITS, CHECKSUM_HEXDIGITS
7-
from .database_types import (
8-
ColType, Decimal, Float, Integer, FractionalType, Native_UUID, TemporalType, Text, Timestamp
5+
from .base import (
6+
MD5_HEXDIGITS,
7+
CHECKSUM_HEXDIGITS,
8+
TIMESTAMP_PRECISION_POS,
9+
ThreadedDatabase,
10+
import_helper,
11+
ConnectError,
912
)
13+
from .database_types import ColType, Decimal, Float, Integer, FractionalType, Native_UUID, TemporalType, Text, Timestamp
1014

1115

1216
@import_helper("clickhouse")
1317
def import_clickhouse():
1418
import clickhouse_driver
19+
1520
return clickhouse_driver
1621

1722

@@ -24,42 +29,35 @@ def cursor(self, cursor_factory=None):
2429

2530
class Clickhouse(ThreadedDatabase):
2631
TYPE_CLASSES = {
27-
'Int8': Integer,
28-
'Int16': Integer,
29-
'Int32': Integer,
30-
'Int64': Integer,
31-
'Int128': Integer,
32-
'Int256': Integer,
33-
34-
'UInt8': Integer,
35-
'UInt16': Integer,
36-
'UInt32': Integer,
37-
'UInt64': Integer,
38-
'UInt128': Integer,
39-
'UInt256': Integer,
40-
41-
'Float32': Float,
42-
'Float64': Float,
43-
44-
'Decimal': Decimal,
45-
46-
'UUID': Native_UUID,
47-
48-
'String': Text,
49-
'FixedString': Text,
50-
51-
'DateTime': Timestamp,
52-
'DateTime64': Timestamp,
53-
32+
"Int8": Integer,
33+
"Int16": Integer,
34+
"Int32": Integer,
35+
"Int64": Integer,
36+
"Int128": Integer,
37+
"Int256": Integer,
38+
"UInt8": Integer,
39+
"UInt16": Integer,
40+
"UInt32": Integer,
41+
"UInt64": Integer,
42+
"UInt128": Integer,
43+
"UInt256": Integer,
44+
"Float32": Float,
45+
"Float64": Float,
46+
"Decimal": Decimal,
47+
"UUID": Native_UUID,
48+
"String": Text,
49+
"FixedString": Text,
50+
"DateTime": Timestamp,
51+
"DateTime64": Timestamp,
5452
}
55-
ROUNDS_ON_PREC_LOSS = True
53+
ROUNDS_ON_PREC_LOSS = False
5654

5755
def __init__(self, *, thread_count: int, **kw):
5856
super().__init__(thread_count=thread_count)
5957

6058
self._args = kw
6159
# In Clickhouse database and schema are the same
62-
self.default_schema = kw['database']
60+
self.default_schema = kw["database"]
6361

6462
def create_connection(self):
6563
clickhouse = import_clickhouse()
@@ -70,16 +68,16 @@ def create_connection(self):
7068
raise ConnectError(*e.args) from e
7169

7270
def _parse_type_repr(self, type_repr: str) -> Optional[Type[ColType]]:
73-
nullable_prefix = 'Nullable'
74-
if type_repr.lower().startswith(nullable_prefix.lower()):
75-
type_repr = type_repr[len(nullable_prefix):].lstrip('(').rstrip(')')
71+
nullable_prefix = "Nullable("
72+
if type_repr.startswith(nullable_prefix):
73+
type_repr = type_repr.replace("Nullable(", "").rstrip(")")
7674

77-
if type_repr.startswith('Decimal'):
78-
type_repr = 'Decimal'
79-
elif type_repr.startswith('FixedString'):
80-
type_repr = 'FixedString'
81-
elif type_repr.startswith('DateTime64'):
82-
type_repr = 'DateTime64'
75+
if type_repr.startswith("Decimal"):
76+
type_repr = "Decimal"
77+
elif type_repr.startswith("FixedString"):
78+
type_repr = "FixedString"
79+
elif type_repr.startswith("DateTime64"):
80+
type_repr = "DateTime64"
8381

8482
return self.TYPE_CLASSES.get(type_repr)
8583

@@ -88,19 +86,21 @@ def quote(self, s: str) -> str:
8886

8987
def md5_to_int(self, s: str) -> str:
9088
substr_idx = 1 + MD5_HEXDIGITS - CHECKSUM_HEXDIGITS
91-
return f'reinterpretAsUInt128(reverse(unhex(lowerUTF8(substr(hex(MD5({s})), {substr_idx})))))'
89+
return f"reinterpretAsUInt128(reverse(unhex(lowerUTF8(substr(hex(MD5({s})), {substr_idx})))))"
9290

9391
def to_string(self, s: str) -> str:
9492
return f"toString({s})"
9593

9694
def normalize_timestamp(self, value: str, coltype: TemporalType) -> str:
95+
prec= coltype.precision
9796
if coltype.rounds:
98-
prec = coltype.precision
99-
timestamp = f'toDateTime64(round(toUnixTimestamp64Micro(toDateTime64({value}, 6)) / 1000000, {prec}), 6)'
97+
timestamp = f"toDateTime64(round(toUnixTimestamp64Micro(toDateTime64({value}, 6)) / 1000000, {prec}), 6)"
10098
return self.to_string(timestamp)
101-
else:
102-
fractional = f'toUnixTimestamp64Micro(toDateTime64({value}, 6)) % 1000000'
103-
return f"formatDateTime({value}, '%Y-%m-%d %H:%M:%S') || '.' || {self.to_string(fractional)}"
99+
100+
fractional = f"toUnixTimestamp64Micro(toDateTime64({value}, {prec})) % 1000000"
101+
fractional = f"lpad({self.to_string(fractional)}, 6, '0')"
102+
value = f"formatDateTime({value}, '%Y-%m-%d %H:%M:%S') || '.' || {self.to_string(fractional)}"
103+
return f"rpad({value}, {TIMESTAMP_PRECISION_POS + 6}, '0')"
104104

105105
def _convert_db_precision_to_digits(self, p: int) -> int:
106106
# Done the same as for PostgreSQL but need to rewrite in another way
@@ -125,7 +125,7 @@ def normalize_number(self, value: str, coltype: FractionalType) -> str:
125125
# with length = digits in an integer part + 1 (symbol of ".") + precision
126126

127127
if coltype.precision == 0:
128-
return self.to_string(f'round({value})')
128+
return self.to_string(f"round({value})")
129129

130130
precision = coltype.precision
131131
# TODO: too complex, is there better performance way?

tests/test_database_types.py

Lines changed: 1 addition & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -480,14 +480,7 @@ def _insert_to_table(conn, table, values, type):
480480

481481
elif isinstance(conn, db.Clickhouse):
482482
if type.startswith("DateTime64"):
483-
# Clickhouse does not round microseconds when inserting in contrast to PostgreSQL and MySQL.
484-
# For example, if we have '2022-06-01 15:10:05.009900' and want to store it with precision 3,
485-
# Clickhouse will store it as '2022-06-01 15:10:05.009'
486-
# Postgres/MySQL as '2022-06-01 15:10:05.010'
487-
sample = sample.replace(tzinfo=None)
488-
precision = int(type[11:].rstrip(')'))
489-
microsecond = round(round(sample.microsecond / 1_000_000, precision) * 1_000_000, 6)
490-
value = f"'{sample.replace(microsecond=int(microsecond))}'"
483+
value = f"'{sample.replace(tzinfo=None)}'"
491484

492485
elif type == 'DateTime':
493486
sample = sample.replace(tzinfo=None)

0 commit comments

Comments
 (0)