diff --git a/bigquery/README.md b/bigquery/README.md index cdf99566e..9079561d3 100644 --- a/bigquery/README.md +++ b/bigquery/README.md @@ -1,13 +1,4 @@ -As of 2025, Google Bigquery allow publishing benchmark results, which was not the case earlier. - -It's very difficult to find, how to create a database. -Databases are named "datasets". You need to press on `⋮` near project. - -Create dataset `test`. -Go to the query editor and paste the contents of `create.sql`. -It will take two seconds to create a table. - -Download Google Cloud CLI: +Download Google Cloud CLI and configure your project settings. You can skip this step if you are using [Cloud shell](https://docs.cloud.google.com/shell/docs/launching-cloud-shell): ``` wget --continue --progress=dot:giga https://dl.google.com/dl/cloudsdk/channels/rapid/downloads/google-cloud-cli-linux-x86_64.tar.gz tar -xf google-cloud-cli-linux-x86_64.tar.gz @@ -16,7 +7,12 @@ source .bashrc ./google-cloud-sdk/bin/gcloud init ``` -Load the data: +Create the dataset and table in BigQuery: +``` +./create.sh +``` + +Load the data in the table: ``` wget --continue --progress=dot:giga 'https://datasets.clickhouse.com/hits_compatible/hits.csv.gz' gzip -d -f hits.csv.gz @@ -26,13 +22,7 @@ command time -f '%e' bq load --source_format CSV --allow_quoted_newlines=1 test. ``` Run the benchmark: - ``` -./run.sh 2>&1 | tee log.txt - -cat log.txt | - grep -P '^real|^Error' | - sed -r -e 's/^Error.*$/null/; s/^real\s*([0-9.]+)m([0-9.]+)s$/\1 \2/' | - awk '{ if ($2 != "") { print $1 * 60 + $2 } else { print $1 } }' | - awk '{ if ($1 == "null") { skip = 1 } else { if (i % 3 == 0) { printf "[" }; printf skip ? 
"null" : $1; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; skip = 0; } }' +pip install google-cloud-bigquery +python3 run_queries.py > results.txt 2> log.txt ``` diff --git a/bigquery/create.sh b/bigquery/create.sh new file mode 100755 index 000000000..6f9c26788 --- /dev/null +++ b/bigquery/create.sh @@ -0,0 +1,5 @@ +#!/bin/bash + +bq mk --dataset test + +bq query --use_legacy_sql=false < create.sql diff --git a/bigquery/create.sql b/bigquery/create.sql index 9012df89a..63ebbcd2e 100644 --- a/bigquery/create.sql +++ b/bigquery/create.sql @@ -104,5 +104,6 @@ CREATE TABLE test.hits HasGCLID SMALLINT NOT NULL, RefererHash BIGINT NOT NULL, URLHash BIGINT NOT NULL, - CLID INTEGER NOT NULL + CLID INTEGER NOT NULL, + PRIMARY KEY (CounterID, EventDate, UserID, EventTime, WatchID) NOT ENFORCED ); diff --git a/bigquery/queries.sql b/bigquery/queries.sql index 3dc8f405a..a97258890 100644 --- a/bigquery/queries.sql +++ b/bigquery/queries.sql @@ -26,7 +26,7 @@ SELECT SearchPhrase FROM test.hits WHERE SearchPhrase <> '' ORDER BY EventTime L SELECT SearchPhrase FROM test.hits WHERE SearchPhrase <> '' ORDER BY SearchPhrase LIMIT 10; SELECT SearchPhrase FROM test.hits WHERE SearchPhrase <> '' ORDER BY EventTime, SearchPhrase LIMIT 10; SELECT CounterID, AVG(length(URL)) AS l, COUNT(*) AS c FROM test.hits WHERE URL <> '' GROUP BY CounterID HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25; -SELECT REGEXP_REPLACE(Referer, '^https?://(?:www\.)?([^/]+)/.*$', '\1') AS k, AVG(length(Referer)) AS l, COUNT(*) AS c, MIN(Referer) FROM test.hits WHERE Referer <> '' GROUP BY k HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25; +SELECT REGEXP_REPLACE(Referer, '^https?://(?:www\\.)?([^/]+)/.*$', '\\1') AS k, AVG(length(Referer)) AS l, COUNT(*) AS c, MIN(Referer) FROM test.hits WHERE Referer <> '' GROUP BY k HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25; SELECT SUM(ResolutionWidth), SUM(ResolutionWidth + 1), SUM(ResolutionWidth + 2), SUM(ResolutionWidth + 3), SUM(ResolutionWidth + 
4), SUM(ResolutionWidth + 5), SUM(ResolutionWidth + 6), SUM(ResolutionWidth + 7), SUM(ResolutionWidth + 8), SUM(ResolutionWidth + 9), SUM(ResolutionWidth + 10), SUM(ResolutionWidth + 11), SUM(ResolutionWidth + 12), SUM(ResolutionWidth + 13), SUM(ResolutionWidth + 14), SUM(ResolutionWidth + 15), SUM(ResolutionWidth + 16), SUM(ResolutionWidth + 17), SUM(ResolutionWidth + 18), SUM(ResolutionWidth + 19), SUM(ResolutionWidth + 20), SUM(ResolutionWidth + 21), SUM(ResolutionWidth + 22), SUM(ResolutionWidth + 23), SUM(ResolutionWidth + 24), SUM(ResolutionWidth + 25), SUM(ResolutionWidth + 26), SUM(ResolutionWidth + 27), SUM(ResolutionWidth + 28), SUM(ResolutionWidth + 29), SUM(ResolutionWidth + 30), SUM(ResolutionWidth + 31), SUM(ResolutionWidth + 32), SUM(ResolutionWidth + 33), SUM(ResolutionWidth + 34), SUM(ResolutionWidth + 35), SUM(ResolutionWidth + 36), SUM(ResolutionWidth + 37), SUM(ResolutionWidth + 38), SUM(ResolutionWidth + 39), SUM(ResolutionWidth + 40), SUM(ResolutionWidth + 41), SUM(ResolutionWidth + 42), SUM(ResolutionWidth + 43), SUM(ResolutionWidth + 44), SUM(ResolutionWidth + 45), SUM(ResolutionWidth + 46), SUM(ResolutionWidth + 47), SUM(ResolutionWidth + 48), SUM(ResolutionWidth + 49), SUM(ResolutionWidth + 50), SUM(ResolutionWidth + 51), SUM(ResolutionWidth + 52), SUM(ResolutionWidth + 53), SUM(ResolutionWidth + 54), SUM(ResolutionWidth + 55), SUM(ResolutionWidth + 56), SUM(ResolutionWidth + 57), SUM(ResolutionWidth + 58), SUM(ResolutionWidth + 59), SUM(ResolutionWidth + 60), SUM(ResolutionWidth + 61), SUM(ResolutionWidth + 62), SUM(ResolutionWidth + 63), SUM(ResolutionWidth + 64), SUM(ResolutionWidth + 65), SUM(ResolutionWidth + 66), SUM(ResolutionWidth + 67), SUM(ResolutionWidth + 68), SUM(ResolutionWidth + 69), SUM(ResolutionWidth + 70), SUM(ResolutionWidth + 71), SUM(ResolutionWidth + 72), SUM(ResolutionWidth + 73), SUM(ResolutionWidth + 74), SUM(ResolutionWidth + 75), SUM(ResolutionWidth + 76), SUM(ResolutionWidth + 77), SUM(ResolutionWidth + 78), 
SUM(ResolutionWidth + 79), SUM(ResolutionWidth + 80), SUM(ResolutionWidth + 81), SUM(ResolutionWidth + 82), SUM(ResolutionWidth + 83), SUM(ResolutionWidth + 84), SUM(ResolutionWidth + 85), SUM(ResolutionWidth + 86), SUM(ResolutionWidth + 87), SUM(ResolutionWidth + 88), SUM(ResolutionWidth + 89) FROM test.hits; SELECT SearchEngineID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM test.hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, ClientIP ORDER BY c DESC LIMIT 10; SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM test.hits WHERE SearchPhrase <> '' GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10; diff --git a/bigquery/results/result.json b/bigquery/results/result.json index c46efe88b..9875c53d1 100644 --- a/bigquery/results/result.json +++ b/bigquery/results/result.json @@ -1,6 +1,6 @@ { "system": "Bigquery", - "date": "2025-04-09", + "date": "2025-10-28", "machine": "serverless", "cluster_size": "serverless", "proprietary": "yes", @@ -9,52 +9,52 @@ "tags": ["serverless", "column-oriented", "gcp", "managed"], - "load_time": 1146, + "load_time": 776.91, "data_size": 8760000000, "result": [ -[4.862,4.001,3.921], -[4.268,4.113,4.467], -[4.341,4.15,4.219], -[4.124,3.996,4.337], -[4.553,4.36,4.349], -[4.565,4.4,4.661], -[4.089,4.132,3.974], -[4.514,4.296,4.312], -[6.183,6.155,4.557], -[6.068,6.106,6.259], -[4.109,4.082,4.165], -[4.24,3.981,4.054], -[4.295,4.301,4.283], -[6.03,6.079,6.094], -[4.383,4.399,4.218], -[4.304,4.23,4.189], -[4.849,4.86,4.62], -[4.309,4.371,4.393], -[6.096,6.109,6.071], -[3.838,3.89,3.938], -[4.249,4.037,4.136], -[4.337,4.196,4.264], -[4.493,4.603,4.435], -[6.125,4.667,4.559], -[4.039,4.039,3.942], -[3.903,4.239,4.003], -[4.013,4.108,4.073], -[4.524,4.474,4.498], -[null,null,null], -[4.866,4.862,6.063], -[4.271,4.403,4.34], -[4.39,4.314,4.566], -[7.233,7.322,7.241], -[7.39,7.382,7.298], -[6.05,6.084,6.362], -[4.31,4.222,4.254], -[4.181,4.003,3.95], -[3.98,3.988,3.982], 
"""Run the benchmark queries against BigQuery and report client-side timings.

Reads one SQL statement per line from ``queries.sql`` in the current working
directory, runs each statement ``TRIES`` times with the query cache disabled,
and prints one ``[t1,t2,t3],`` row per query to stdout (seconds, ``null`` for
a failed attempt).  Progress, job identifiers and errors go to stderr, so the
two streams can be redirected separately:

    python3 run_queries.py > results.txt 2> log.txt
"""

import sys
import time
from datetime import datetime, timedelta
from typing import Any, Optional, Sequence, TextIO

QUERIES_FILE = 'queries.sql'
TRIES = 3


def log(*objects: Any, sep: str = ' ', end: str = '\n', file: TextIO = sys.stderr, severity: str = 'INFO') -> None:
    """Write a timestamped, severity-tagged message with a print()-like API.

    Args:
        *objects: Values to log; each one is converted with str().
        sep: Separator inserted between values, default a space.
        end: String appended after the last value, default a newline.
        file: Stream with write() and flush() methods, default sys.stderr
            (keeps log output out of the timing rows written to stdout).
        severity: Log level label, upper-cased in the output
            (e.g. "INFO", "WARNING", "ERROR").
    """
    message = sep.join(str(obj) for obj in objects)
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    file.write(f"[{timestamp}] [{severity.upper()}]: {message}{end}")
    # Flush unconditionally: the streams this script actually logs to are
    # stdout/stderr, which may be block-buffered when redirected to a file.
    file.flush()


def format_row(timings: Sequence[Optional[float]]) -> str:
    """Render one query's timings as a result row such as ``[0.5,null,1.2],``.

    A timing of None (failed attempt) is written as ``null`` so that every
    row keeps the same number of entries and stays well-formed.
    """
    cells = ("null" if seconds is None else str(seconds) for seconds in timings)
    return "[" + ",".join(cells) + "],"


def time_query(client: Any, query: str, job_config: Any) -> Optional[float]:
    """Run *query* once and return the elapsed client-side time in seconds.

    Args:
        client: Object exposing ``query_and_wait(query, job_config=...)``;
            in this script a ``bigquery.Client``.
        query: The SQL statement to execute.
        job_config: Job configuration forwarded to ``query_and_wait``.

    Returns:
        Elapsed seconds (microsecond resolution), or None if the call raised.
    """
    # perf_counter() is monotonic; datetime.now() can jump if the system
    # clock is adjusted while a query is in flight.
    start = time.perf_counter()
    try:
        results = client.query_and_wait(query, job_config=job_config)
    except Exception as e:
        # Deliberately broad: one failing query must not abort the whole
        # benchmark run; the failure is logged and reported as null.
        log(f"Job failed with error: {e}", severity="ERROR")
        return None
    # Going through timedelta keeps the original microsecond rounding and
    # the original "H:MM:SS.ffffff" rendering in the log.
    client_time = timedelta(seconds=time.perf_counter() - start)

    log(f"Job ID: **{results.job_id}**")
    log(f"Query ID: **{results.query_id}**")
    log(f"Client time: **{client_time}**")
    return client_time.total_seconds()


def main() -> None:
    """Run every query in QUERIES_FILE TRIES times and print the timing rows."""
    # Imported here rather than at module level so the helpers above remain
    # importable on machines without the BigQuery client library installed.
    from google.cloud import bigquery
    from google.cloud.bigquery.enums import JobCreationMode

    job_config = bigquery.QueryJobConfig()
    # Disable the result cache so repeated tries are not served from it.
    job_config.use_query_cache = False
    client = bigquery.Client(
        default_job_creation_mode=JobCreationMode.JOB_CREATION_OPTIONAL
    )

    with open(QUERIES_FILE, 'r', encoding='utf-8') as queries:
        for line in queries:
            query = line.strip()
            if not query:
                # A blank line is not a query; do not submit it.
                continue
            timings = []
            for i in range(TRIES):
                log(f"\n[{i}]: {query}")
                timings.append(time_query(client, query, job_config))
            # Emit the finished row at once so results.txt is always made of
            # complete rows, even if the run is interrupted later.
            print(format_row(timings), flush=True)


if __name__ == "__main__":
    main()