Skip to content

Argentina (add more countries) #73

@thiagovmdon

Description

@thiagovmdon

Related to issue #5

Code for list of stations (and metadata #1):

import requests
import pandas as pd
import numpy as np

# Mapping between INA variable IDs and their English names.
# The numeric ids are the `var_id` values used by the INA a5 API
# (https://alerta.ina.gob.ar/pub/gui/apibase).
# NOTE: key order matters — get_ina_metadata() iterates these keys as its
# default `var_ids`, so reordering entries changes request/row order.
VARIABLE_ENGLISH = {
    4:  "discharge-instantaneous",
    40: "discharge-daily-mean",
    68: "discharge-daily-max",
    69: "discharge-daily-min",
    87: "discharge-hourly",
    2:  "stage-instantaneous",
    39: "stage-daily-mean",
    85: "stage-hourly",
    73: "water-temperature",
}

def get_ina_metadata(var_ids=None) -> pd.DataFrame:
    """
    Fetch and return full metadata for INA (Argentina) observation series.

    - Keeps ALL original JSON fields from the API.
    - Renames only the standardized ones:
        id → gauge_id
        estacion.nombre → station_name
        estacion.rio → river
        estacion.geom.coordinates → [longitude, latitude]
    - Adds or fills standardized fields with NaN if missing:
        ['gauge_id', 'station_name', 'river', 'latitude', 'longitude',
         'altitude', 'area', 'country', 'source', 'variable']
    - Adds constant fields:
        country='Argentina', source='INA (DSIyAH)'

    Parameters
    ----------
    var_ids : list of int, optional
        INA variable ids to query. Defaults to all keys of VARIABLE_ENGLISH.

    Returns
    -------
    pd.DataFrame
        One row per series, with standardized columns guaranteed to exist.
        Best-effort: returns an empty DataFrame on any failure instead of
        raising.

    Source: https://alerta.ina.gob.ar/a5/obs/puntual/series
    """
    if var_ids is None:
        var_ids = list(VARIABLE_ENGLISH.keys())

    base_url = "https://alerta.ina.gob.ar/a5/obs/puntual/series"

    all_records = []

    try:
        for vid in var_ids:
            r = requests.get(base_url, params={"var_id": vid}, timeout=30)
            r.raise_for_status()

            # The API can return either {"rows": [...]} or a list directly.
            # Parse the JSON exactly once; calling .get() on a list payload
            # would raise AttributeError, so only use .get() for dicts.
            payload = r.json()
            if isinstance(payload, dict):
                data = payload.get("rows", payload)
            else:
                data = payload
            if not isinstance(data, list):
                continue

            for s in data:
                est = s.get("estacion", {})
                geom = est.get("geom") or {}
                coords = geom.get("coordinates") or []
                # Guard against malformed geometries with fewer than 2 values.
                lon = coords[0] if len(coords) > 0 else None
                lat = coords[1] if len(coords) > 1 else None

                # Base record for standardized fields.
                rec = {
                    "gauge_id": est.get("id"),
                    "series_id": s.get("id"),
                    "station_name": est.get("nombre"),
                    "river": est.get("rio"),
                    "latitude": lat,
                    "longitude": lon,
                    "altitude": np.nan,  # not provided by this endpoint
                    "area": np.nan,      # not provided by this endpoint
                    "country": est.get("pais", "Argentina"),
                    "source": "Sistema de Información Hidrológica de la Cuenca del Plata - DSIyAH INA",
                    "variable": VARIABLE_ENGLISH.get(vid),
                }

                # Include all raw fields (flattening nested dicts); the
                # standardized keys above take precedence over raw ones.
                flat_series = pd.json_normalize(s)
                for k, v in flat_series.to_dict(orient="records")[0].items():
                    if k not in rec:
                        rec[k] = v

                all_records.append(rec)

        df = pd.DataFrame(all_records)

        # Ensure standardized fields exist even when the API returned nothing.
        std_cols = [
            "gauge_id",
            "station_name",
            "river",
            "latitude",
            "longitude",
            "altitude",
            "area",
            "country",
            "source",
            "variable",
        ]
        for col in std_cols:
            if col not in df.columns:
                df[col] = np.nan

        # Coordinates may arrive as strings; coerce to numeric (NaN on failure).
        df["latitude"] = pd.to_numeric(df["latitude"], errors="coerce")
        df["longitude"] = pd.to_numeric(df["longitude"], errors="coerce")

        # Keep ALL columns.
        return df.reset_index(drop=True)

    except Exception as e:
        # Deliberate best-effort behavior: log and return an empty frame
        # rather than propagate network/parse errors to the caller.
        print(f"Failed to fetch INA metadata: {e}")
        return pd.DataFrame()

Code for downloading the data

Here I have a question on how to proceed: for Argentina we need both the gauge_id and the series_id of the given variable at the given gauge (a unique code per gauge) — so, in theory, two codes. The current implementation requires the user to provide both. What do you think is the best solution? @kratzert and @simonmoulds ? I thought about using the cached data, since the argentina_sites.csv file would contain both the gauge_id and the series_id.

import requests
import pandas as pd
import numpy as np
from typing import Optional


VARIABLE_MAP = {
    "discharge-instantaneous": 4,
    "discharge-daily-mean": 40,
    "discharge-daily-max": 68,
    "discharge-daily-min": 69,
    "discharge-hourly": 87,
    "stage-instantaneous": 2,
    "stage-daily-mean": 39,
    "stage-hourly": 85,
    "water-temperature": 73,
}


def get_ina_data(
    gauge_id: str | int,
    series_id: str | int,
    variable: str,
    start_date: Optional[str] = None,
    end_date: Optional[str] = None,
    show_url: bool = False,
) -> pd.DataFrame:
    """
    Download time series from INA using your exact URL pattern
    (query string begins with '&' and no 'request=datos').
    Source and description: https://alerta.ina.gob.ar/pub/gui/apibase

    Returns a DataFrame with columns: ['time', '<variable>'].
    """
    variable = variable.lower()
    if variable not in VARIABLE_MAP:
        raise ValueError(f"Invalid variable '{variable}'. Allowed: {list(VARIABLE_MAP.keys())}")

    if not start_date:
        start_date = "1900-01-01"  # default

    if not end_date:
        end_date = pd.Timestamp.now().date().strftime("%Y-%m-%d")  # today


    var_id = VARIABLE_MAP[variable]

    base_url = "https://alerta.ina.gob.ar/pub/datos/datos"
    url = (
        f"{base_url}"
        f"&timeStart={start_date}"
        f"&timeEnd={end_date}"
        f"&seriesId={series_id}"
        f"&siteCode={gauge_id}"
        f"&varId={var_id}"
        f"&format=json"
    )

    if show_url:
        print("Requesting URL:", url)

    r = requests.get(url, timeout=30)
    r.raise_for_status()
    js = r.json()

    data = js.get("data", [])
    if not data:
        return pd.DataFrame(columns=["time", variable])

    df = pd.DataFrame(data)
    if "timestart" not in df or "valor" not in df:
        return pd.DataFrame(columns=["time", variable])

    df = df.rename(columns={"timestart": "time", "valor": variable})
    df["time"] = pd.to_datetime(df["time"], errors="coerce")
    df[variable] = pd.to_numeric(df[variable], errors="coerce")

    df = df.dropna(subset=["time"])
    df = df.drop_duplicates(subset="time", keep="first")
    df = df.sort_values("time").reset_index(drop=True)
    df = df[["time", variable]]

    return df

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions