diff --git a/opendata-python/Makefile b/opendata-python/Makefile index 9c10732..c41119f 100644 --- a/opendata-python/Makefile +++ b/opendata-python/Makefile @@ -2,7 +2,9 @@ venv: pipenv install tox tox-pyenv twine test: venv - pipenv run tox + docker-compose -f docker-compose.test.yaml up -d + - pipenv run tox + docker-compose down build: venv pipenv run python setup.py sdist bdist_wheel diff --git a/opendata-python/README.md b/opendata-python/README.md index e0d3b94..5799e41 100644 --- a/opendata-python/README.md +++ b/opendata-python/README.md @@ -166,3 +166,65 @@ activities[99].metadata ... }} ``` + +### Connecting to a PostgreSQL database +Although having all the Open Data files available as plain files on your computer has advantages (especially for less tech-savvy users), querying the data is slow and can be complicated. +To overcome this, it is possible to store all the data in a [PostgreSQL](https://www.postgresql.org/) database as well. + +Setting up PostgreSQL (documentation [here](https://www.postgresql.org/docs/11/tutorial-install.html)) can be a hassle, so there is a `docker-compose.yaml` included in this repository that *should* work out of the box by running `docker-compose up` in the directory where the file is stored. +I am not going into the rabbit hole of explaining how to install docker and docker-compose here (a quick search will yield enough results for that). One comment: On macOS and Linux installation is mostly painless; on Windows it is not always, and I would advise against using docker there. +As an alternative, you can use a local installation of PostgreSQL (assuming username=opendata, password=password, database name=opendata by default). 
+ +When PostgreSQL is installed correctly and running, inserting data into the database is as easy as: +```python +from opendata import OpenData +from opendata.db.main import OpenDataDB +from opendata.models import LocalAthlete + +od = OpenData() +opendatadb = OpenDataDB() +opendatadb.create_tables() # This is only needed once + +athlete = od.get_remote_athlete('0031326c-e796-4f35-8f25-d3937edca90f') + +opendatadb.insert_athlete(athlete) +``` +Please note: This only inserts the athlete into the database, not the activities for this athlete. +To add all the activities too: +```python +for activity in athlete.activities(): + opendatadb.insert_activity(activity, athlete) +``` + +At this point there are two tables in the opendata database: "athletes" and "activities". +The database schemas for both tables can be viewed [here](opendata/db/models.py). + +If you are familiar with raw SQL you can query the database directly, but if you prefer to stay in Python land, I have you covered too: Under the hood this library uses the [SQLAlchemy](https://www.sqlalchemy.org/) ORM. +For some general documentation on how that works, see [here](https://docs.sqlalchemy.org/en/latest/orm/tutorial.html). +Querying the data is possible using SQLAlchemy's query language (documentation [here](https://docs.sqlalchemy.org/en/latest/orm/query.html)). 
+ +For example, to get a count of all activities that have power: +```python +from opendata.db import models +from sqlalchemy.sql import not_ + +session = opendatadb.get_session() +session.query(models.Activity).filter(not_(models.Activity.power.all('nan'))).count() +``` + +Filters can be [chained](https://docs.sqlalchemy.org/en/latest/glossary.html#term-method-chaining) to apply multiple filters in one query: +```python +from datetime import datetime + +from opendata.db import models +from sqlalchemy.sql import not_ + +session = opendatadb.get_session() +session.query(models.Activity).filter(models.Activity.datetime <= datetime(2017, 1, 1)).\ + filter(not_(models.Activity.power.all('nan'))).count() +``` + +You can also query for nested keys/values in the metadata (stored in the "meta" column because SQLAlchemy reserves the "metadata" attribute for internal use): +```python +session.query(models.Activity).filter(models.Activity.metrics.contains({'workout_time': '2703.00000'})).count() +``` diff --git a/opendata-python/docker-compose.test.yaml b/opendata-python/docker-compose.test.yaml new file mode 100644 index 0000000..401ca69 --- /dev/null +++ b/opendata-python/docker-compose.test.yaml @@ -0,0 +1,12 @@ +version: '3.3' + +services: + postgres: + image: postgres + restart: always + ports: + - "5433:5432" + environment: + POSTGRES_USER: opendata + POSTGRES_PASSWORD: password + POSTGRES_DB: opendata diff --git a/opendata-python/docker-compose.yaml b/opendata-python/docker-compose.yaml new file mode 100644 index 0000000..cfb17a5 --- /dev/null +++ b/opendata-python/docker-compose.yaml @@ -0,0 +1,14 @@ +version: '3.3' + +services: + postgres: + image: postgres + restart: always + ports: + - "5432:5432" + volumes: + - ./postgres-data:/var/lib/postgresql/data + environment: + POSTGRES_USER: opendata + POSTGRES_PASSWORD: password + POSTGRES_DB: opendata diff --git a/opendata-python/opendata/conf.py b/opendata-python/opendata/conf.py index b692b53..4d69e7e 100644 --- 
a/opendata-python/opendata/conf.py +++ b/opendata-python/opendata/conf.py @@ -28,5 +28,10 @@ data_prefix='data', metadata_prefix='metadata', datasets_prefix='datasets', - local_storage=config['Storage']['local_storage_path'] + local_storage=config['Storage']['local_storage_path'], + db_host='localhost', + db_port='5432', + db_user='opendata', + db_password='password', + db_name='opendata', ) diff --git a/opendata-python/opendata/db/__init__.py b/opendata-python/opendata/db/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/opendata-python/opendata/db/constants.py b/opendata-python/opendata/db/constants.py new file mode 100644 index 0000000..c890408 --- /dev/null +++ b/opendata-python/opendata/db/constants.py @@ -0,0 +1,11 @@ +csv_to_db_mapping = { + 'secs': 'time', + 'km': 'distance', + 'spd': 'speed', + 'power': 'power', + 'cad': 'cadence', + 'hr': 'heartrate', + 'alt': 'altitude', + 'slope': 'slope', + 'temp': 'temperature', +} diff --git a/opendata-python/opendata/db/main.py b/opendata-python/opendata/db/main.py new file mode 100644 index 0000000..181ea56 --- /dev/null +++ b/opendata-python/opendata/db/main.py @@ -0,0 +1,83 @@ +from contextlib import contextmanager + +from sqlalchemy import create_engine +from sqlalchemy.orm import sessionmaker + +from opendata.conf import settings +from opendata.utils import filename_to_datetime +from . 
class OpenDataDB:
    """Small helper around SQLAlchemy for storing athletes and activities.

    Connection parameters default to the values in ``opendata.conf.settings``.
    Every call to :meth:`get_engine` builds a brand-new engine, so prefer the
    :meth:`engine` and :meth:`session` context managers, which release what
    they create.
    """

    def __init__(self, host=settings.db_host, port=settings.db_port,
                 user=settings.db_user, password=settings.db_password,
                 database=settings.db_name):
        """Store the connection parameters; no connection is opened here.

        Args:
            host: Database server host name.
            port: Database server port.
            user: User name to authenticate with.
            password: Password to authenticate with.
            database: Name of the database to connect to.
        """
        self.host = host
        self.port = port
        self.user = user
        self.password = password
        self.database = database
        # Unbound session factory; the engine is supplied per session
        # through ``bind=``.
        self.Session = sessionmaker()

    def get_engine(self):
        """Create and return a new engine for the configured database.

        The caller owns the returned engine and is responsible for calling
        ``dispose()`` on it; use :meth:`engine` to have that done for you.
        """
        # "postgresql" is the dialect name; the legacy "postgres" alias is
        # rejected by SQLAlchemy 1.4 and later.
        # NOTE: the credentials are interpolated verbatim, so characters
        # such as '@' or '/' must already be URL-encoded by the caller.
        return create_engine(
            f'postgresql://{self.user}:{self.password}'
            f'@{self.host}:{self.port}/{self.database}'
        )

    @contextmanager
    def engine(self):
        """Yield a fresh engine and dispose of it on exit, even on error."""
        engine = self.get_engine()
        try:
            yield engine
        finally:
            engine.dispose()

    def get_session(self):
        """Return a new session bound to a newly created engine.

        Neither the session nor its engine is cleaned up by this method;
        the caller should ``close()`` the session when done. Use
        :meth:`session` for automatic cleanup.
        """
        return self.Session(bind=self.get_engine())

    @contextmanager
    def session(self):
        """Yield a session; close it and dispose its engine on exit.

        Cleanup runs even when the ``with`` body raises. Nothing is
        committed implicitly: callers must ``commit()`` themselves.
        """
        with self.engine() as engine:
            session = self.Session(bind=engine)
            try:
                yield session
            finally:
                session.close()

    def create_tables(self):
        """Create the tables declared on ``models.Base``."""
        # ``create_all`` runs directly against the engine, so no ORM
        # session is needed here.
        with self.engine() as engine:
            models.Base.metadata.create_all(engine)

    def insert_athlete(self, athlete):
        """Insert ``athlete`` as a ``models.Athlete`` row and commit.

        Only ``athlete.id`` and ``athlete.metadata`` are stored; the
        athlete's activities are not inserted.
        """
        with self.session() as session:
            session.add(models.Athlete(
                id=athlete.id,
                meta=athlete.metadata
            ))
            session.commit()

    def insert_activity(self, activity, athlete=None):
        """Insert ``activity`` as a ``models.Activity`` row and commit.

        Args:
            activity: Object exposing ``id``, ``metadata`` and ``data``.
                ``data`` is presumably a pandas DataFrame keyed by CSV
                column name -- TODO confirm against the caller.
            athlete: Optional owner; when given, its ``id`` is stored on
                the activity.
        """
        metadata = activity.metadata
        metrics = None
        if metadata is not None:
            # Work on a shallow copy so the caller's metadata keeps its
            # METRICS entry; popping in place would make a second insert
            # of the same activity silently store no metrics.
            metadata = dict(metadata)
            metrics = metadata.pop('METRICS', None)

        db_activity = models.Activity(
            id=activity.id,
            datetime=filename_to_datetime(activity.id),
            meta=metadata,
            metrics=metrics,
        )

        if athlete is not None:
            db_activity.athlete = athlete.id

        # Copy every known CSV column that is present onto the matching
        # model attribute as a plain list.
        for csv_column, db_column in csv_to_db_mapping.items():
            if csv_column in activity.data:
                setattr(
                    db_activity,
                    db_column,
                    activity.data[csv_column].values.tolist()
                )

        with self.session() as session:
            session.add(db_activity)
            session.commit()
sqlalchemy import Column, Float, ForeignKey, String +from sqlalchemy.dialects import postgresql +from sqlalchemy.types import DateTime +from sqlalchemy.ext.declarative import declarative_base +from sqlalchemy.orm import relationship + +Base = declarative_base() + + +class Athlete(Base): + __tablename__ = 'athletes' + + id = Column(String, primary_key=True) + meta = Column(postgresql.JSONB) + activities = relationship('Activity') + + def __repr__(self): + return f'