diff --git a/pysatTutorials/pysat_data_loading_and_iteration.ipynb b/pysatTutorials/pysat_data_loading_and_iteration.ipynb new file mode 100644 index 0000000..291e835 --- /dev/null +++ b/pysatTutorials/pysat_data_loading_and_iteration.ipynb @@ -0,0 +1,695 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 162, + "metadata": {}, + "outputs": [], + "source": [ + "import datetime as dt\n", + "import numpy as np\n", + "import pandas as pds\n", + "import time\n", + "\n", + "import pysat\n", + "\n", + "def print_range(inst):\n", + " \"\"\"Print loaded data range\"\"\"\n", + " print('Loaded Data Range: ')\n", + " print(inst.index[0], ' --- ', inst.index[-1], '\\n')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# The following cells each set up a test condition. Run one of them, then execute the 'Testing Load Behaviors' cell below.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 147, + "metadata": {}, + "outputs": [], + "source": [ + "# basic instrument setup\n", + "inst = pysat.Instrument('pysat', 'testing')\n", + "verify = False" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "# instrument setup with data padding\n", + "inst = pysat.Instrument('pysat', 'testing', pad={'minutes': 5})\n", + "verify = True" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# instrument setup with longer data padding\n", + "inst = pysat.Instrument('pysat', 'testing', pad={'minutes': 5, 'days': 1})\n", + "verify = True" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "# instrument setup with multi_file_day\n", + "inst = pysat.Instrument('pysat', 'testing', multi_file_day=True)\n", + "verify = True" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Testing Load Behaviors" + ] + }, + { + "cell_type": "code", + "execution_count": 151, +
"metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "load command via yr, doy: 2009, 1\n", + "Loaded Data Range: \n", + "2009-01-01 00:00:00 --- 2009-01-01 23:59:59 \n", + "\n", + "load command via yr, doy pairs: 2009, 1, 2009, 3\n", + "Loaded Data Range: \n", + "2009-01-01 00:00:00 --- 2009-01-02 23:59:59 \n", + "\n", + "load command via date: 2009-01-01 00:00:00\n", + "Loaded Data Range: \n", + "2009-01-01 00:00:00 --- 2009-01-01 23:59:59 \n", + "\n", + "load command via dates: 2009-01-01 00:00:00 , 2009-01-03 00:00:00\n", + "Loaded Data Range: \n", + "2009-01-01 00:00:00 --- 2009-01-02 23:59:59 \n", + "\n", + "Load via filename: 2009-01-01.nofile\n", + "Loaded Data Range: \n", + "2009-01-01 00:00:00 --- 2009-01-01 23:59:59 \n", + "\n", + "Load via filenames: 2009-01-01.nofile , 2009-01-02.nofile\n", + "Loaded Data Range: \n", + "2009-01-01 00:00:00 --- 2009-01-02 23:59:59 \n", + "\n", + "Note that inst.date refers to the earliest day loaded, excluding padding\n", + "inst.date: 2009-01-01 00:00:00\n" + ] + } + ], + "source": [ + "# load by yr, doy\n", + "try:\n", + " print('load command via yr, doy: ', '2009, 1')\n", + " inst.load(2009, 1, verifyPad=verify)\n", + " print_range(inst)\n", + "except ValueError as err:\n", + " print(err, '\\n')\n", + "\n", + "# inclusive/exclusive syntax for loading over a range of dates\n", + "print('load command via yr, doy pairs: ', '2009, 1, 2009, 3')\n", + "inst.load(2009, 1, 2009, 3, verifyPad=verify)\n", + "print_range(inst)\n", + "\n", + "# load by date\n", + "try:\n", + " date = dt.datetime(2009, 1, 1)\n", + " print('load command via date: ', date)\n", + " inst.load(date=date, verifyPad=verify)\n", + " print_range(inst)\n", + "except ValueError as err:\n", + " print(err, '\\n')\n", + "\n", + "\n", + "# load by range of dates\n", + "date = dt.datetime(2009, 1, 1)\n", + "end_date = dt.datetime(2009, 1, 3)\n", + "print('load command via dates: ', date, ', ', end_date)\n", + "# 
inclusive/exclusive syntax for loading over a range of dates\n", + "inst.load(date=date, end_date=end_date, verifyPad=verify)\n", + "print_range(inst)\n", + "\n", + "# load by filename\n", + "try:\n", + " fname = inst.files[366]\n", + " print('Load via filename: ', fname)\n", + " inst.load(fname=fname, verifyPad=verify)\n", + " print_range(inst)\n", + "except ValueError as err:\n", + " print(err, '\\n')\n", + "\n", + "\n", + "# load by filenames\n", + "try:\n", + " fname = inst.files[366]\n", + " stop_fname = inst.files[367]\n", + " # inclusive syntax for filenames, start and end\n", + " print('Load via filenames: ', fname, ', ', stop_fname)\n", + " inst.load(fname=fname, stop_fname=stop_fname, verifyPad=verify)\n", + " print_range(inst)\n", + "except ValueError as err:\n", + " print(err, '\\n')\n", + "\n", + "\n", + "print('Note that inst.date refers to the earliest day loaded, excluding padding')\n", + "print('inst.date: ', inst.date)" + ] + }, + { + "cell_type": "code", + "execution_count": 152, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Loaded Data Range: \n", + "2008-01-01 00:00:00 --- 2010-12-31 00:00:09 \n", + "\n" + ] + } + ], + "source": [ + "# Load all data, first file through last file\n", + "inst = pysat.Instrument('pysat', 'testing', num_daily_samples=10)\n", + "inst.load()\n", + "print_range(inst)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Test out iteration behaviors" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Run one of the cells below to set up checking out iteration via date, or via file" + ] + }, + { + "cell_type": "code", + "execution_count": 153, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "([datetime.datetime(2008, 1, 1, 0, 0)], [datetime.datetime(2008, 1, 11, 0, 0)], '2D', <DateOffset: days=2>)\n" + ] + } + ], + "source": [ + "# Set bounds on instrument iteration, denoted via date or
file limits.\n", + "# Format when setting bounds:\n", + "# bounds = (start dates/files, stop dates/files, step size, width of loaded data)\n", + "# Note that start and stop dates are effectively imposed upon inst.date, i.e., the earliest loaded day in the object.\n", + "# When loading with a data width greater than one day, some of the data samples on the last iteration may \n", + "# extend past the supplied bounds, depending upon the step size and data width chosen.\n", + "\n", + "\n", + "# set up bounds via dates\n", + "# iterate with step size of 2, width of 2 (days)\n", + "date = inst.files.start_date\n", + "date2 = inst.files.start_date + pds.DateOffset(days=10)\n", + "date2 = date2.to_pydatetime()\n", + "# format (dt.datetime or list-of, dt.datetime or list-of, pandas frequency string, pandas DateOffset)\n", + "# Ensuring type consistency means that Timestamp date2 needs to become datetime date2.\n", + "# Incidentally, the frequency string '2D' can also be a DateOffset\n", + "bounds = (date, date2, '2D', pds.DateOffset(days=2))\n", + "inst.bounds = bounds\n", + "print(inst.bounds)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "([datetime.datetime(2008, 1, 1, 0, 0)], [datetime.datetime(2008, 1, 11, 0, 0)], '2D', <DateOffset: days=1>)\n" + ] + } + ], + "source": [ + "# set up bounds via dates\n", + "# iterate with step size of 2, width of 1 (days)\n", + "date = inst.files.start_date\n", + "date2 = inst.files.start_date + pds.DateOffset(days=10)\n", + "date2 = date2.to_pydatetime()\n", + "# format (dt.datetime or list-of, dt.datetime or list-of, pandas frequency string, pandas DateOffset)\n", + "# Ensuring type consistency means that Timestamp date2 needs to become datetime date2.\n", + "# Incidentally, the frequency string '2D' can also be a DateOffset\n", + "bounds = (date, date2, '2D', pds.DateOffset(days=1))\n", + "inst.bounds = bounds\n", + "print(inst.bounds)" + ] +
}, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "([datetime.datetime(2008, 1, 1, 0, 0), datetime.datetime(2008, 1, 31, 0, 0)], [datetime.datetime(2008, 1, 11, 0, 0), datetime.datetime(2008, 2, 10, 0, 0)], '3D', <DateOffset: days=4>)\n" + ] + } + ], + "source": [ + "# set up multiple bounds via dates\n", + "# iterate with step size of 3, width of 4 (days)\n", + "\n", + "date = inst.files.start_date\n", + "date2 = inst.files.start_date + pds.DateOffset(days=10)\n", + "date2 = date2.to_pydatetime()\n", + "\n", + "date3 = date2 + pds.DateOffset(days=20)\n", + "date3 = date3.to_pydatetime()\n", + "date4 = date3 + pds.DateOffset(days=10)\n", + "date4 = date4.to_pydatetime()\n", + "\n", + "# format (dt.datetime or list-of, dt.datetime or list-of, pandas frequency string, pandas DateOffset)\n", + "# Ensuring type consistency means that Timestamp date2 needs to become datetime date2.\n", + "# Incidentally, the frequency string '3D' can also be a DateOffset\n", + "bounds = ((date, date3), (date2, date4), '3D', pds.DateOffset(days=4))\n", + "inst.bounds = bounds\n", + "print(inst.bounds)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(array(['2008-01-01.nofile'], dtype='= inst.files.get_index(inst.bounds[0][0])\n", + " assert inst._fid <= inst.files.get_index(inst.bounds[1][0])\n", + "\n", + "# instrument setup\n", + "# We need to start with a new Instrument to ensure we test .next() starting from nothing\n", + "inst = pysat.Instrument('pysat', 'testing')\n", + "inst.bounds = bounds\n", + "print('Instrument bounds: ')\n", + "for start, stop in zip(inst.bounds[0], inst.bounds[1]):\n", + " print('Start: ', start, ' Stop : ', stop)\n", + "print('Step Size : ', bounds[2])\n", + "print('Data Width : ', bounds[3])\n", + "print('\\n')\n", + "\n", + "\n", + "print('Checking iteration via
next(). Starting from new object.')\n", + "while True:\n", + " try:\n", + " inst.next()\n", + " print_range(inst)\n", + " check_bounds(inst)\n", + " except StopIteration:\n", + " break\n", + "\n", + "# We need to start with a new Instrument to ensure we test .prev() starting from nothing\n", + "inst = pysat.Instrument('pysat', 'testing')\n", + "inst.bounds = bounds\n", + "print('Checking iteration via prev(). Starting from new object.')\n", + "while True:\n", + " try:\n", + " inst.prev()\n", + " print_range(inst)\n", + " check_bounds(inst)\n", + " except StopIteration:\n", + " break\n", + "\n", + "\n", + "# We need to start with a new Instrument to ensure we test iteration starting from nothing\n", + "inst = pysat.Instrument('pysat', 'testing')\n", + "inst.bounds = bounds\n", + "print('Checking iteration via built-in iteration. Starting from new object.')\n", + "for inst in inst:\n", + " print_range(inst)\n", + " check_bounds(inst)\n", + "\n", + "\n", + "# Instrument currently on last day\n", + "print('One last quick check on .prev(), continuing from last object.')\n", + "inst.prev()\n", + "print_range(inst)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Test out orbit behaviors" + ] + }, + { + "cell_type": "code", + "execution_count": 184, + "metadata": {}, + "outputs": [], + "source": [ + "# requires running a bounds setting cell above (in iteration testing prep area)\n", + "orbit_info = {'kind': 'lt', 'index': 'mlt'}\n", + "inst = pysat.Instrument('pysat', 'testing', orbit_info=orbit_info)\n", + "inst.bounds = bounds\n" + ] + }, + { + "cell_type": "code", + "execution_count": 174, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Loaded Data Range: \n", + "2008-01-01 00:00:00 --- 2008-01-01 00:38:59 \n", + "\n", + "Loaded Data Range: \n", + "2008-01-01 00:39:00 --- 2008-01-01 02:15:59 \n", + "\n", + "Loaded Data Range: \n", + "2008-01-01 02:16:00 --- 2008-01-01 
03:52:59 \n", + "\n", + "Loaded Data Range: \n", + "2008-01-01 03:53:00 --- 2008-01-01 05:29:59 \n", + "\n", + "Loaded Data Range: \n", + "2008-01-01 05:30:00 --- 2008-01-01 07:06:59 \n", + "\n", + "Loaded Data Range: \n", + "2008-01-01 07:07:00 --- 2008-01-01 08:43:59 \n", + "\n", + "Loaded Data Range: \n", + "2008-01-01 08:44:00 --- 2008-01-01 10:20:59 \n", + "\n", + "Loaded Data Range: \n", + "2008-01-01 10:21:00 --- 2008-01-01 11:57:59 \n", + "\n", + "Loaded Data Range: \n", + "2008-01-01 11:58:00 --- 2008-01-01 13:34:59 \n", + "\n", + "Loaded Data Range: \n", + "2008-01-01 13:35:00 --- 2008-01-01 15:11:59 \n", + "\n" + ] + } + ], + "source": [ + "# demonstrate iterating over orbits via next\n", + "for i in np.arange(10):\n", + " inst.orbits.next()\n", + " print_range(inst)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 185, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Loaded Data Range: \n", + "2008-01-01 00:00:00 --- 2008-01-01 00:38:59 \n", + "\n", + "Loaded Data Range: \n", + "2008-01-01 00:39:00 --- 2008-01-01 02:15:59 \n", + "\n", + "Loaded Data Range: \n", + "2008-01-01 02:16:00 --- 2008-01-01 03:52:59 \n", + "\n", + "Loaded Data Range: \n", + "2008-01-01 03:53:00 --- 2008-01-01 05:29:59 \n", + "\n", + "Loaded Data Range: \n", + "2008-01-01 05:30:00 --- 2008-01-01 07:06:59 \n", + "\n", + "Loaded Data Range: \n", + "2008-01-01 07:07:00 --- 2008-01-01 08:43:59 \n", + "\n", + "Loaded Data Range: \n", + "2008-01-01 08:44:00 --- 2008-01-01 10:20:59 \n", + "\n", + "Loaded Data Range: \n", + "2008-01-01 10:21:00 --- 2008-01-01 11:57:59 \n", + "\n", + "Loaded Data Range: \n", + "2008-01-01 11:58:00 --- 2008-01-01 13:34:59 \n", + "\n", + "Loaded Data Range: \n", + "2008-01-01 13:35:00 --- 2008-01-01 15:11:59 \n", + "\n", + "Loaded Data Range: \n", + "2008-01-01 15:12:00 --- 2008-01-01 16:48:59 \n", + "\n", + "Loaded Data Range: \n", + "2008-01-01 16:49:00 --- 2008-01-01 18:25:59 \n", + "\n", + 
"Loaded Data Range: \n", + "2008-01-01 18:26:00 --- 2008-01-01 20:02:59 \n", + "\n", + "Loaded Data Range: \n", + "2008-01-01 20:03:00 --- 2008-01-01 21:39:59 \n", + "\n", + "Loaded Data Range: \n", + "2008-01-01 21:40:00 --- 2008-01-01 23:16:59 \n", + "\n", + "Loaded Data Range: \n", + "2008-01-01 23:17:00 --- 2008-01-01 23:59:59 \n", + "\n", + "Loaded Data Range: \n", + "2008-01-03 00:00:00 --- 2008-01-03 01:08:59 \n", + "\n" + ] + } + ], + "source": [ + "# demonstrate iterating over orbits via built-in iteration\n", + "for i, inst in enumerate(inst.orbits):\n", + " print_range(inst)\n", + " if i > 15:\n", + " break\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}